xref: /openbmc/qemu/fpu/softfloat.c (revision 979582d0)
1 /*
2  * QEMU float support
3  *
4  * The code in this source file is derived from release 2a of the SoftFloat
5  * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6  * some later contributions) are provided under that license, as detailed below.
7  * It has subsequently been modified by contributors to the QEMU Project,
8  * so some portions are provided under:
9  *  the SoftFloat-2a license
10  *  the BSD license
11  *  GPL-v2-or-later
12  *
13  * Any future contributions to this file after December 1st 2014 will be
14  * taken to be licensed under the Softfloat-2a license unless specifically
15  * indicated otherwise.
16  */
17 
18 /*
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
22 
23 Written by John R. Hauser.  This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704.  Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980.  The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
32 
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38 
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
43 
44 ===============================================================================
45 */
46 
47 /* BSD licensing:
48  * Copyright (c) 2006, Fabrice Bellard
49  * All rights reserved.
50  *
51  * Redistribution and use in source and binary forms, with or without
52  * modification, are permitted provided that the following conditions are met:
53  *
54  * 1. Redistributions of source code must retain the above copyright notice,
55  * this list of conditions and the following disclaimer.
56  *
57  * 2. Redistributions in binary form must reproduce the above copyright notice,
58  * this list of conditions and the following disclaimer in the documentation
59  * and/or other materials provided with the distribution.
60  *
61  * 3. Neither the name of the copyright holder nor the names of its contributors
62  * may be used to endorse or promote products derived from this software without
63  * specific prior written permission.
64  *
65  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75  * THE POSSIBILITY OF SUCH DAMAGE.
76  */
77 
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79  * version 2 or later. See the COPYING file in the top-level directory.
80  */
81 
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83  * target-dependent and needs the TARGET_* macros.
84  */
85 #include "qemu/osdep.h"
86 #include <math.h>
87 #include "qemu/bitops.h"
88 #include "fpu/softfloat.h"
89 
90 /* We only need stdlib for abort() */
91 
92 /*----------------------------------------------------------------------------
93 | Primitive arithmetic functions, including multi-word arithmetic, and
94 | division and square root approximations.  (Can be specialized to target if
95 | desired.)
96 *----------------------------------------------------------------------------*/
97 #include "fpu/softfloat-macros.h"
98 
99 /*
100  * Hardfloat
101  *
102  * Fast emulation of guest FP instructions is challenging for two reasons.
103  * First, FP instruction semantics are similar but not identical, particularly
104  * when handling NaNs. Second, emulating at reasonable speed the guest FP
105  * exception flags is not trivial: reading the host's flags register with a
106  * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp],
107  * and trapping on every FP exception is neither fast nor pleasant to work with.
108  *
109  * We address these challenges by leveraging the host FPU for a subset of the
110  * operations. To do this we expand on the idea presented in this paper:
111  *
112  * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a
113  * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615.
114  *
115  * The idea is thus to leverage the host FPU to (1) compute FP operations
116  * and (2) identify whether FP exceptions occurred while avoiding
117  * expensive exception flag register accesses.
118  *
119  * An important optimization shown in the paper is that given that exception
120  * flags are rarely cleared by the guest, we can avoid recomputing some flags.
121  * This is particularly useful for the inexact flag, which is very frequently
122  * raised in floating-point workloads.
123  *
124  * We optimize the code further by deferring to soft-fp whenever FP exception
125  * detection might get hairy. Two examples: (1) when at least one operand is
126  * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result
127  * and the magnitude of the result is at or below the smallest normal.
128  */
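/*
 * Roughly, the generated fast paths below (see float32_gen2/float64_gen2)
 * take the following shape; this is only a sketch of the control flow, the
 * real checks are the 'pre' and 'post' callbacks passed in by each operation:
 *
 *   if (!can_use_fpu(status)) goto soft;   // inexact not set, or not RNE
 *   flush denormal inputs if flush_inputs_to_zero is set;
 *   if (!pre(a, b)) goto soft;             // e.g. an operand is not zero/normal
 *   result = host FPU operation;
 *   if (isinf(result)) raise overflow;
 *   else if (|result| <= smallest normal && post(a, b)) goto soft;
 *   return result;
 */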
129 #define GEN_INPUT_FLUSH__NOCHECK(name, soft_t)                          \
130     static inline void name(soft_t *a, float_status *s)                 \
131     {                                                                   \
132         if (unlikely(soft_t ## _is_denormal(*a))) {                     \
133             *a = soft_t ## _set_sign(soft_t ## _zero,                   \
134                                      soft_t ## _is_neg(*a));            \
135             float_raise(float_flag_input_denormal, s);                  \
136         }                                                               \
137     }
138 
139 GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck, float32)
140 GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck, float64)
141 #undef GEN_INPUT_FLUSH__NOCHECK
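/*
 * For reference, the float32 instantiation above expands to roughly:
 *
 *   static inline void float32_input_flush__nocheck(float32 *a, float_status *s)
 *   {
 *       if (unlikely(float32_is_denormal(*a))) {
 *           *a = float32_set_sign(float32_zero, float32_is_neg(*a));
 *           float_raise(float_flag_input_denormal, s);
 *       }
 *   }
 */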
142 
143 #define GEN_INPUT_FLUSH1(name, soft_t)                  \
144     static inline void name(soft_t *a, float_status *s) \
145     {                                                   \
146         if (likely(!s->flush_inputs_to_zero)) {         \
147             return;                                     \
148         }                                               \
149         soft_t ## _input_flush__nocheck(a, s);          \
150     }
151 
152 GEN_INPUT_FLUSH1(float32_input_flush1, float32)
153 GEN_INPUT_FLUSH1(float64_input_flush1, float64)
154 #undef GEN_INPUT_FLUSH1
155 
156 #define GEN_INPUT_FLUSH2(name, soft_t)                                  \
157     static inline void name(soft_t *a, soft_t *b, float_status *s)      \
158     {                                                                   \
159         if (likely(!s->flush_inputs_to_zero)) {                         \
160             return;                                                     \
161         }                                                               \
162         soft_t ## _input_flush__nocheck(a, s);                          \
163         soft_t ## _input_flush__nocheck(b, s);                          \
164     }
165 
166 GEN_INPUT_FLUSH2(float32_input_flush2, float32)
167 GEN_INPUT_FLUSH2(float64_input_flush2, float64)
168 #undef GEN_INPUT_FLUSH2
169 
170 #define GEN_INPUT_FLUSH3(name, soft_t)                                  \
171     static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \
172     {                                                                   \
173         if (likely(!s->flush_inputs_to_zero)) {                         \
174             return;                                                     \
175         }                                                               \
176         soft_t ## _input_flush__nocheck(a, s);                          \
177         soft_t ## _input_flush__nocheck(b, s);                          \
178         soft_t ## _input_flush__nocheck(c, s);                          \
179     }
180 
181 GEN_INPUT_FLUSH3(float32_input_flush3, float32)
182 GEN_INPUT_FLUSH3(float64_input_flush3, float64)
183 #undef GEN_INPUT_FLUSH3
184 
185 /*
186  * Choose whether to use fpclassify or float32/64_* primitives in the generated
187  * hardfloat functions. Each combination of number of inputs and float size
188  * gets its own value.
189  */
190 #if defined(__x86_64__)
191 # define QEMU_HARDFLOAT_1F32_USE_FP 0
192 # define QEMU_HARDFLOAT_1F64_USE_FP 1
193 # define QEMU_HARDFLOAT_2F32_USE_FP 0
194 # define QEMU_HARDFLOAT_2F64_USE_FP 1
195 # define QEMU_HARDFLOAT_3F32_USE_FP 0
196 # define QEMU_HARDFLOAT_3F64_USE_FP 1
197 #else
198 # define QEMU_HARDFLOAT_1F32_USE_FP 0
199 # define QEMU_HARDFLOAT_1F64_USE_FP 0
200 # define QEMU_HARDFLOAT_2F32_USE_FP 0
201 # define QEMU_HARDFLOAT_2F64_USE_FP 0
202 # define QEMU_HARDFLOAT_3F32_USE_FP 0
203 # define QEMU_HARDFLOAT_3F64_USE_FP 0
204 #endif
205 
206 /*
207  * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over
208  * float{32,64}_is_infinity when !USE_FP.
209  * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup.
210  * On power64, however, using isinf() reduces fp-bench performance by up to 50%.
211  */
212 #if defined(__x86_64__) || defined(__aarch64__)
213 # define QEMU_HARDFLOAT_USE_ISINF   1
214 #else
215 # define QEMU_HARDFLOAT_USE_ISINF   0
216 #endif
217 
218 /*
219  * Some targets clear the FP flags before most FP operations. This prevents
220  * the use of hardfloat, since hardfloat relies on the inexact flag being
221  * already set.
222  */
223 #if defined(TARGET_PPC) || defined(__FAST_MATH__)
224 # if defined(__FAST_MATH__)
225 #  warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \
226     IEEE implementation
227 # endif
228 # define QEMU_NO_HARDFLOAT 1
229 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN
230 #else
231 # define QEMU_NO_HARDFLOAT 0
232 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline))
233 #endif
234 
235 static inline bool can_use_fpu(const float_status *s)
236 {
237     if (QEMU_NO_HARDFLOAT) {
238         return false;
239     }
240     return likely(s->float_exception_flags & float_flag_inexact &&
241                   s->float_rounding_mode == float_round_nearest_even);
242 }
243 
244 /*
245  * Hardfloat generation functions. Each operation can have two flavors:
246  * either using softfloat primitives (e.g. float32_is_zero_or_normal) for
247  * most condition checks, or native ones (e.g. fpclassify).
248  *
249  * The flavor is chosen by the callers. Instead of using macros, we rely on the
250  * compiler to propagate constants and inline everything into the callers.
251  *
252  * We only generate functions for operations with two inputs, since only
253  * these are common enough to justify consolidating them into common code.
254  */
255 
256 typedef union {
257     float32 s;
258     float h;
259 } union_float32;
260 
261 typedef union {
262     float64 s;
263     double h;
264 } union_float64;
265 
266 typedef bool (*f32_check_fn)(union_float32 a, union_float32 b);
267 typedef bool (*f64_check_fn)(union_float64 a, union_float64 b);
268 
269 typedef float32 (*soft_f32_op2_fn)(float32 a, float32 b, float_status *s);
270 typedef float64 (*soft_f64_op2_fn)(float64 a, float64 b, float_status *s);
271 typedef float   (*hard_f32_op2_fn)(float a, float b);
272 typedef double  (*hard_f64_op2_fn)(double a, double b);
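/*
 * The 'hard'/'soft' operations and the 'pre'/'post' checks are supplied by
 * the individual operations further down; for example, float32_addsub()
 * below passes hard_f32_add/soft_f32_add together with f32_is_zon2 as the
 * 'pre' check and f32_addsubmul_post as the 'post' check, relying on the
 * compiler to inline and constant-fold everything as noted above.
 */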
273 
274 /* 2-input is-zero-or-normal */
275 static inline bool f32_is_zon2(union_float32 a, union_float32 b)
276 {
277     if (QEMU_HARDFLOAT_2F32_USE_FP) {
278         /*
279          * Not using a temp variable for consecutive fpclassify calls ends up
280          * generating faster code.
281          */
282         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
283                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
284     }
285     return float32_is_zero_or_normal(a.s) &&
286            float32_is_zero_or_normal(b.s);
287 }
288 
289 static inline bool f64_is_zon2(union_float64 a, union_float64 b)
290 {
291     if (QEMU_HARDFLOAT_2F64_USE_FP) {
292         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
293                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
294     }
295     return float64_is_zero_or_normal(a.s) &&
296            float64_is_zero_or_normal(b.s);
297 }
298 
299 /* 3-input is-zero-or-normal */
300 static inline
301 bool f32_is_zon3(union_float32 a, union_float32 b, union_float32 c)
302 {
303     if (QEMU_HARDFLOAT_3F32_USE_FP) {
304         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
305                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
306                (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
307     }
308     return float32_is_zero_or_normal(a.s) &&
309            float32_is_zero_or_normal(b.s) &&
310            float32_is_zero_or_normal(c.s);
311 }
312 
313 static inline
314 bool f64_is_zon3(union_float64 a, union_float64 b, union_float64 c)
315 {
316     if (QEMU_HARDFLOAT_3F64_USE_FP) {
317         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
318                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
319                (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
320     }
321     return float64_is_zero_or_normal(a.s) &&
322            float64_is_zero_or_normal(b.s) &&
323            float64_is_zero_or_normal(c.s);
324 }
325 
326 static inline bool f32_is_inf(union_float32 a)
327 {
328     if (QEMU_HARDFLOAT_USE_ISINF) {
329         return isinf(a.h);
330     }
331     return float32_is_infinity(a.s);
332 }
333 
334 static inline bool f64_is_inf(union_float64 a)
335 {
336     if (QEMU_HARDFLOAT_USE_ISINF) {
337         return isinf(a.h);
338     }
339     return float64_is_infinity(a.s);
340 }
341 
342 static inline float32
343 float32_gen2(float32 xa, float32 xb, float_status *s,
344              hard_f32_op2_fn hard, soft_f32_op2_fn soft,
345              f32_check_fn pre, f32_check_fn post)
346 {
347     union_float32 ua, ub, ur;
348 
349     ua.s = xa;
350     ub.s = xb;
351 
352     if (unlikely(!can_use_fpu(s))) {
353         goto soft;
354     }
355 
356     float32_input_flush2(&ua.s, &ub.s, s);
357     if (unlikely(!pre(ua, ub))) {
358         goto soft;
359     }
360 
361     ur.h = hard(ua.h, ub.h);
362     if (unlikely(f32_is_inf(ur))) {
363         float_raise(float_flag_overflow, s);
364     } else if (unlikely(fabsf(ur.h) <= FLT_MIN) && post(ua, ub)) {
365         goto soft;
366     }
367     return ur.s;
368 
369  soft:
370     return soft(ua.s, ub.s, s);
371 }
372 
373 static inline float64
374 float64_gen2(float64 xa, float64 xb, float_status *s,
375              hard_f64_op2_fn hard, soft_f64_op2_fn soft,
376              f64_check_fn pre, f64_check_fn post)
377 {
378     union_float64 ua, ub, ur;
379 
380     ua.s = xa;
381     ub.s = xb;
382 
383     if (unlikely(!can_use_fpu(s))) {
384         goto soft;
385     }
386 
387     float64_input_flush2(&ua.s, &ub.s, s);
388     if (unlikely(!pre(ua, ub))) {
389         goto soft;
390     }
391 
392     ur.h = hard(ua.h, ub.h);
393     if (unlikely(f64_is_inf(ur))) {
394         float_raise(float_flag_overflow, s);
395     } else if (unlikely(fabs(ur.h) <= DBL_MIN) && post(ua, ub)) {
396         goto soft;
397     }
398     return ur.s;
399 
400  soft:
401     return soft(ua.s, ub.s, s);
402 }
403 
404 /*----------------------------------------------------------------------------
405 | Returns the fraction bits of the single-precision floating-point value `a'.
406 *----------------------------------------------------------------------------*/
407 
408 static inline uint32_t extractFloat32Frac(float32 a)
409 {
410     return float32_val(a) & 0x007FFFFF;
411 }
412 
413 /*----------------------------------------------------------------------------
414 | Returns the exponent bits of the single-precision floating-point value `a'.
415 *----------------------------------------------------------------------------*/
416 
417 static inline int extractFloat32Exp(float32 a)
418 {
419     return (float32_val(a) >> 23) & 0xFF;
420 }
421 
422 /*----------------------------------------------------------------------------
423 | Returns the sign bit of the single-precision floating-point value `a'.
424 *----------------------------------------------------------------------------*/
425 
426 static inline bool extractFloat32Sign(float32 a)
427 {
428     return float32_val(a) >> 31;
429 }
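/*
 * Illustrative example of the float32 layout these helpers assume
 * (sign bit 31, exponent bits 30..23, fraction bits 22..0): for
 * a = 0x40490fdb (approximately pi), extractFloat32Sign(a) == 0,
 * extractFloat32Exp(a) == 0x80 (biased; unbiased +1) and
 * extractFloat32Frac(a) == 0x490fdb.
 */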
430 
431 /*----------------------------------------------------------------------------
432 | Returns the fraction bits of the double-precision floating-point value `a'.
433 *----------------------------------------------------------------------------*/
434 
435 static inline uint64_t extractFloat64Frac(float64 a)
436 {
437     return float64_val(a) & UINT64_C(0x000FFFFFFFFFFFFF);
438 }
439 
440 /*----------------------------------------------------------------------------
441 | Returns the exponent bits of the double-precision floating-point value `a'.
442 *----------------------------------------------------------------------------*/
443 
444 static inline int extractFloat64Exp(float64 a)
445 {
446     return (float64_val(a) >> 52) & 0x7FF;
447 }
448 
449 /*----------------------------------------------------------------------------
450 | Returns the sign bit of the double-precision floating-point value `a'.
451 *----------------------------------------------------------------------------*/
452 
453 static inline bool extractFloat64Sign(float64 a)
454 {
455     return float64_val(a) >> 63;
456 }
457 
458 /*
459  * Classify a floating point number. float_class_qnan and everything
460  * above it are NaNs, so cls >= float_class_qnan tests for any NaN.
461  */
462 
463 typedef enum __attribute__ ((__packed__)) {
464     float_class_unclassified,
465     float_class_zero,
466     float_class_normal,
467     float_class_inf,
468     float_class_qnan,  /* all NaNs from here */
469     float_class_snan,
470 } FloatClass;
471 
472 #define float_cmask(bit)  (1u << (bit))
473 
474 enum {
475     float_cmask_zero    = float_cmask(float_class_zero),
476     float_cmask_normal  = float_cmask(float_class_normal),
477     float_cmask_inf     = float_cmask(float_class_inf),
478     float_cmask_qnan    = float_cmask(float_class_qnan),
479     float_cmask_snan    = float_cmask(float_class_snan),
480 
481     float_cmask_infzero = float_cmask_zero | float_cmask_inf,
482     float_cmask_anynan  = float_cmask_qnan | float_cmask_snan,
483 };
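/*
 * These masks let a caller classify two or three operands at once; for
 * example, muladd_floats() below computes
 *     ab_mask = float_cmask(a.cls) | float_cmask(b.cls);
 * and then "ab_mask == float_cmask_infzero" is true exactly when one of
 * the multiplicands is an infinity and the other a zero.
 */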
484 
485 
486 /* Simple helpers for checking if, or what kind of, NaN we have */
487 static inline __attribute__((unused)) bool is_nan(FloatClass c)
488 {
489     return unlikely(c >= float_class_qnan);
490 }
491 
492 static inline __attribute__((unused)) bool is_snan(FloatClass c)
493 {
494     return c == float_class_snan;
495 }
496 
497 static inline __attribute__((unused)) bool is_qnan(FloatClass c)
498 {
499     return c == float_class_qnan;
500 }
501 
502 /*
503  * Structure holding all of the decomposed parts of a float.
504  * The exponent is unbiased and the fraction is normalized.
505  *
506  * The fraction words are stored in big-endian word ordering,
507  * so that truncation from a larger format to a smaller format
508  * can be done simply by ignoring subsequent elements.
509  */
510 
511 typedef struct {
512     FloatClass cls;
513     bool sign;
514     int32_t exp;
515     union {
516         /* Routines that know the structure may reference the singular name. */
517         uint64_t frac;
518         /*
519          * Routines expanded with multiple structures reference "hi" and "lo"
520          * depending on the operation.  In FloatParts64, "hi" and "lo" are
521          * both the same word and aliased here.
522          */
523         uint64_t frac_hi;
524         uint64_t frac_lo;
525     };
526 } FloatParts64;
527 
528 typedef struct {
529     FloatClass cls;
530     bool sign;
531     int32_t exp;
532     uint64_t frac_hi;
533     uint64_t frac_lo;
534 } FloatParts128;
535 
536 /* These apply to the most significant word of each FloatPartsN. */
537 #define DECOMPOSED_BINARY_POINT    63
538 #define DECOMPOSED_IMPLICIT_BIT    (1ull << DECOMPOSED_BINARY_POINT)
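/*
 * For illustration: a normal number is held here as
 *   value = (-1)^sign * (frac / 2^63) * 2^exp
 * with the implicit bit at DECOMPOSED_BINARY_POINT, so e.g. float32 1.0
 * canonicalizes to { cls = float_class_normal, sign = 0, exp = 0,
 * frac = 0x8000000000000000 } and 3.0 to exp = 1, frac = 0xc000000000000000.
 */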
539 
540 /* Structure holding all of the relevant parameters for a format.
541  *   exp_size: the size of the exponent field
542  *   exp_bias: the offset applied to the exponent field
543  *   exp_max: the maximum normalised exponent
544  *   frac_size: the size of the fraction field
545  *   frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
546  * The following are computed based on the size of the fraction
547  *   frac_lsb: least significant bit of fraction
548  *   frac_lsbm1: the bit below the least significant bit (for rounding)
549  *   round_mask/roundeven_mask: masks used for rounding
550  * The following optional modifiers are available:
551  *   arm_althp: handle ARM Alternative Half Precision
552  */
553 typedef struct {
554     int exp_size;
555     int exp_bias;
556     int exp_max;
557     int frac_size;
558     int frac_shift;
559     uint64_t frac_lsb;
560     uint64_t frac_lsbm1;
561     uint64_t round_mask;
562     uint64_t roundeven_mask;
563     bool arm_althp;
564 } FloatFmt;
565 
566 /* Expand fields based on the size of exponent and fraction */
567 #define FLOAT_PARAMS(E, F)                                           \
568     .exp_size       = E,                                             \
569     .exp_bias       = ((1 << E) - 1) >> 1,                           \
570     .exp_max        = (1 << E) - 1,                                  \
571     .frac_size      = F,                                             \
572     .frac_shift     = (-F - 1) & 63,                                 \
573     .frac_lsb       = 1ull << ((-F - 1) & 63),                       \
574     .frac_lsbm1     = 1ull << ((-F - 2) & 63),                       \
575     .round_mask     = (1ull << ((-F - 1) & 63)) - 1,                 \
576     .roundeven_mask = (2ull << ((-F - 1) & 63)) - 1
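/*
 * As a worked example, FLOAT_PARAMS(8, 23) below (float32) evaluates to
 * exp_bias = 127, exp_max = 255, frac_shift = 40, frac_lsb = 1ull << 40,
 * frac_lsbm1 = 1ull << 39, round_mask = (1ull << 40) - 1 and
 * roundeven_mask = (1ull << 41) - 1: the 23-bit fraction is shifted up by
 * 40 so that it sits just below the implicit bit at position 63.
 */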
577 
578 static const FloatFmt float16_params = {
579     FLOAT_PARAMS(5, 10)
580 };
581 
582 static const FloatFmt float16_params_ahp = {
583     FLOAT_PARAMS(5, 10),
584     .arm_althp = true
585 };
586 
587 static const FloatFmt bfloat16_params = {
588     FLOAT_PARAMS(8, 7)
589 };
590 
591 static const FloatFmt float32_params = {
592     FLOAT_PARAMS(8, 23)
593 };
594 
595 static const FloatFmt float64_params = {
596     FLOAT_PARAMS(11, 52)
597 };
598 
599 static const FloatFmt float128_params = {
600     FLOAT_PARAMS(15, 112)
601 };
602 
603 /* Unpack a float to parts, but do not canonicalize.  */
604 static void unpack_raw64(FloatParts64 *r, const FloatFmt *fmt, uint64_t raw)
605 {
606     const int f_size = fmt->frac_size;
607     const int e_size = fmt->exp_size;
608 
609     *r = (FloatParts64) {
610         .cls = float_class_unclassified,
611         .sign = extract64(raw, f_size + e_size, 1),
612         .exp = extract64(raw, f_size, e_size),
613         .frac = extract64(raw, 0, f_size)
614     };
615 }
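/*
 * Example: unpacking float32 1.5 (raw 0x3fc00000, f_size = 23, e_size = 8)
 * yields sign = 0, exp = 0x7f (still biased) and frac = 0x400000;
 * normalizing the exponent and fraction is left to sf_canonicalize() below.
 */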
616 
617 static inline void float16_unpack_raw(FloatParts64 *p, float16 f)
618 {
619     unpack_raw64(p, &float16_params, f);
620 }
621 
622 static inline void bfloat16_unpack_raw(FloatParts64 *p, bfloat16 f)
623 {
624     unpack_raw64(p, &bfloat16_params, f);
625 }
626 
627 static inline void float32_unpack_raw(FloatParts64 *p, float32 f)
628 {
629     unpack_raw64(p, &float32_params, f);
630 }
631 
632 static inline void float64_unpack_raw(FloatParts64 *p, float64 f)
633 {
634     unpack_raw64(p, &float64_params, f);
635 }
636 
637 static void float128_unpack_raw(FloatParts128 *p, float128 f)
638 {
639     const int f_size = float128_params.frac_size - 64;
640     const int e_size = float128_params.exp_size;
641 
642     *p = (FloatParts128) {
643         .cls = float_class_unclassified,
644         .sign = extract64(f.high, f_size + e_size, 1),
645         .exp = extract64(f.high, f_size, e_size),
646         .frac_hi = extract64(f.high, 0, f_size),
647         .frac_lo = f.low,
648     };
649 }
650 
651 /* Pack a float from parts, but do not canonicalize.  */
652 static uint64_t pack_raw64(const FloatParts64 *p, const FloatFmt *fmt)
653 {
654     const int f_size = fmt->frac_size;
655     const int e_size = fmt->exp_size;
656     uint64_t ret;
657 
658     ret = (uint64_t)p->sign << (f_size + e_size);
659     ret = deposit64(ret, f_size, e_size, p->exp);
660     ret = deposit64(ret, 0, f_size, p->frac);
661     return ret;
662 }
663 
664 static inline float16 float16_pack_raw(const FloatParts64 *p)
665 {
666     return make_float16(pack_raw64(p, &float16_params));
667 }
668 
669 static inline bfloat16 bfloat16_pack_raw(const FloatParts64 *p)
670 {
671     return pack_raw64(p, &bfloat16_params);
672 }
673 
674 static inline float32 float32_pack_raw(const FloatParts64 *p)
675 {
676     return make_float32(pack_raw64(p, &float32_params));
677 }
678 
679 static inline float64 float64_pack_raw(const FloatParts64 *p)
680 {
681     return make_float64(pack_raw64(p, &float64_params));
682 }
683 
684 static float128 float128_pack_raw(const FloatParts128 *p)
685 {
686     const int f_size = float128_params.frac_size - 64;
687     const int e_size = float128_params.exp_size;
688     uint64_t hi;
689 
690     hi = (uint64_t)p->sign << (f_size + e_size);
691     hi = deposit64(hi, f_size, e_size, p->exp);
692     hi = deposit64(hi, 0, f_size, p->frac_hi);
693     return make_float128(hi, p->frac_lo);
694 }
695 
696 /*----------------------------------------------------------------------------
697 | Functions and definitions to determine:  (1) whether tininess for underflow
698 | is detected before or after rounding by default, (2) what (if anything)
699 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
700 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
701 | are propagated from function inputs to output.  These details are target-
702 | specific.
703 *----------------------------------------------------------------------------*/
704 #include "softfloat-specialize.c.inc"
705 
706 #define PARTS_GENERIC_64_128(NAME, P) \
707     QEMU_GENERIC(P, (FloatParts128 *, parts128_##NAME), parts64_##NAME)
708 
709 #define parts_default_nan(P, S)    PARTS_GENERIC_64_128(default_nan, P)(P, S)
710 #define parts_silence_nan(P, S)    PARTS_GENERIC_64_128(silence_nan, P)(P, S)
711 
712 static void parts64_return_nan(FloatParts64 *a, float_status *s);
713 static void parts128_return_nan(FloatParts128 *a, float_status *s);
714 
715 #define parts_return_nan(P, S)     PARTS_GENERIC_64_128(return_nan, P)(P, S)
716 
717 static FloatParts64 *parts64_pick_nan(FloatParts64 *a, FloatParts64 *b,
718                                       float_status *s);
719 static FloatParts128 *parts128_pick_nan(FloatParts128 *a, FloatParts128 *b,
720                                         float_status *s);
721 
722 #define parts_pick_nan(A, B, S)    PARTS_GENERIC_64_128(pick_nan, A)(A, B, S)
723 
724 static FloatParts64 *parts64_pick_nan_muladd(FloatParts64 *a, FloatParts64 *b,
725                                              FloatParts64 *c, float_status *s,
726                                              int ab_mask, int abc_mask);
727 static FloatParts128 *parts128_pick_nan_muladd(FloatParts128 *a,
728                                                FloatParts128 *b,
729                                                FloatParts128 *c,
730                                                float_status *s,
731                                                int ab_mask, int abc_mask);
732 
733 #define parts_pick_nan_muladd(A, B, C, S, ABM, ABCM) \
734     PARTS_GENERIC_64_128(pick_nan_muladd, A)(A, B, C, S, ABM, ABCM)
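/*
 * QEMU_GENERIC selects on the static type of its first argument, so e.g.
 * parts_pick_nan(&a, &b, s) compiles to parts64_pick_nan() when a and b
 * are FloatParts64 and to parts128_pick_nan() when they are FloatParts128;
 * the selection has no run-time cost.
 */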
735 
736 /*
737  * Helper functions for softfloat-parts.c.inc, per-size operations.
738  */
739 
740 #define FRAC_GENERIC_64_128(NAME, P) \
741     QEMU_GENERIC(P, (FloatParts128 *, frac128_##NAME), frac64_##NAME)
742 
743 static int frac64_cmp(FloatParts64 *a, FloatParts64 *b)
744 {
745     return a->frac == b->frac ? 0 : a->frac < b->frac ? -1 : 1;
746 }
747 
748 static int frac128_cmp(FloatParts128 *a, FloatParts128 *b)
749 {
750     uint64_t ta = a->frac_hi, tb = b->frac_hi;
751     if (ta == tb) {
752         ta = a->frac_lo, tb = b->frac_lo;
753         if (ta == tb) {
754             return 0;
755         }
756     }
757     return ta < tb ? -1 : 1;
758 }
759 
760 #define frac_cmp(A, B)  FRAC_GENERIC_64_128(cmp, A)(A, B)
761 
762 static void frac128_shl(FloatParts128 *a, int c)
763 {
764     shift128Left(a->frac_hi, a->frac_lo, c, &a->frac_hi, &a->frac_lo);
765 }
766 
767 #define frac_shl(A, C)             frac128_shl(A, C)
768 
769 static void frac128_shr(FloatParts128 *a, int c)
770 {
771     shift128Right(a->frac_hi, a->frac_lo, c, &a->frac_hi, &a->frac_lo);
772 }
773 
774 #define frac_shr(A, C)             frac128_shr(A, C)
775 
776 /* Canonicalize EXP and FRAC, setting CLS.  */
777 static FloatParts64 sf_canonicalize(FloatParts64 part, const FloatFmt *parm,
778                                   float_status *status)
779 {
780     if (part.exp == parm->exp_max && !parm->arm_althp) {
781         if (part.frac == 0) {
782             part.cls = float_class_inf;
783         } else {
784             part.frac <<= parm->frac_shift;
785             part.cls = (parts_is_snan_frac(part.frac, status)
786                         ? float_class_snan : float_class_qnan);
787         }
788     } else if (part.exp == 0) {
789         if (likely(part.frac == 0)) {
790             part.cls = float_class_zero;
791         } else if (status->flush_inputs_to_zero) {
792             float_raise(float_flag_input_denormal, status);
793             part.cls = float_class_zero;
794             part.frac = 0;
795         } else {
796             int shift = clz64(part.frac);
797             part.cls = float_class_normal;
798             part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
799             part.frac <<= shift;
800         }
801     } else {
802         part.cls = float_class_normal;
803         part.exp -= parm->exp_bias;
804         part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
805     }
806     return part;
807 }
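/*
 * Concrete example, using float32_params (frac_shift 40, exp_bias 127):
 * the smallest float32 denormal (raw 0x00000001) arrives here with
 * exp == 0 and frac == 1; clz64(1) is 63, so it leaves as a
 * float_class_normal with frac == DECOMPOSED_IMPLICIT_BIT and
 * exp == 40 - 127 - 63 + 1 == -149, i.e. exactly 2^-149.
 */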
808 
809 /* Round and uncanonicalize a floating-point number by parts. There
810  * are FRAC_SHIFT bits that may require rounding at the bottom of the
811  * fraction; these bits will be removed. The exponent will be biased
812  * by EXP_BIAS and must be bounded by [0, EXP_MAX-1].
813  */
814 
815 static FloatParts64 round_canonical(FloatParts64 p, float_status *s,
816                                   const FloatFmt *parm)
817 {
818     const uint64_t frac_lsb = parm->frac_lsb;
819     const uint64_t frac_lsbm1 = parm->frac_lsbm1;
820     const uint64_t round_mask = parm->round_mask;
821     const uint64_t roundeven_mask = parm->roundeven_mask;
822     const int exp_max = parm->exp_max;
823     const int frac_shift = parm->frac_shift;
824     uint64_t frac, inc;
825     int exp, flags = 0;
826     bool overflow_norm;
827 
828     frac = p.frac;
829     exp = p.exp;
830 
831     switch (p.cls) {
832     case float_class_normal:
833         switch (s->float_rounding_mode) {
834         case float_round_nearest_even:
835             overflow_norm = false;
836             inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
837             break;
838         case float_round_ties_away:
839             overflow_norm = false;
840             inc = frac_lsbm1;
841             break;
842         case float_round_to_zero:
843             overflow_norm = true;
844             inc = 0;
845             break;
846         case float_round_up:
847             inc = p.sign ? 0 : round_mask;
848             overflow_norm = p.sign;
849             break;
850         case float_round_down:
851             inc = p.sign ? round_mask : 0;
852             overflow_norm = !p.sign;
853             break;
854         case float_round_to_odd:
855             overflow_norm = true;
856             inc = frac & frac_lsb ? 0 : round_mask;
857             break;
858         default:
859             g_assert_not_reached();
860         }
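        /*
         * For float32 (frac_shift 40), for instance, the nearest-even inc is
         * half an ulp (1ull << 39) except in the exact-tie case where the 40
         * discarded bits equal 0x8000000000 and the kept lsb (bit 40) is
         * already 0; then inc is 0 so the tie rounds to the even neighbour.
         */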
861 
862         exp += parm->exp_bias;
863         if (likely(exp > 0)) {
864             if (frac & round_mask) {
865                 flags |= float_flag_inexact;
866                 if (uadd64_overflow(frac, inc, &frac)) {
867                     frac = (frac >> 1) | DECOMPOSED_IMPLICIT_BIT;
868                     exp++;
869                 }
870             }
871             frac >>= frac_shift;
872 
873             if (parm->arm_althp) {
874                 /* ARM Alt HP eschews Inf and NaN for a wider exponent.  */
875                 if (unlikely(exp > exp_max)) {
876                     /* Overflow.  Return the maximum normal.  */
877                     flags = float_flag_invalid;
878                     exp = exp_max;
879                     frac = -1;
880                 }
881             } else if (unlikely(exp >= exp_max)) {
882                 flags |= float_flag_overflow | float_flag_inexact;
883                 if (overflow_norm) {
884                     exp = exp_max - 1;
885                     frac = -1;
886                 } else {
887                     p.cls = float_class_inf;
888                     goto do_inf;
889                 }
890             }
891         } else if (s->flush_to_zero) {
892             flags |= float_flag_output_denormal;
893             p.cls = float_class_zero;
894             goto do_zero;
895         } else {
896             bool is_tiny = s->tininess_before_rounding || (exp < 0);
897 
898             if (!is_tiny) {
899                 uint64_t discard;
900                 is_tiny = !uadd64_overflow(frac, inc, &discard);
901             }
902 
903             shift64RightJamming(frac, 1 - exp, &frac);
904             if (frac & round_mask) {
905                 /* Need to recompute round-to-even.  */
906                 switch (s->float_rounding_mode) {
907                 case float_round_nearest_even:
908                     inc = ((frac & roundeven_mask) != frac_lsbm1
909                            ? frac_lsbm1 : 0);
910                     break;
911                 case float_round_to_odd:
912                     inc = frac & frac_lsb ? 0 : round_mask;
913                     break;
914                 default:
915                     break;
916                 }
917                 flags |= float_flag_inexact;
918                 frac += inc;
919             }
920 
921             exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
922             frac >>= frac_shift;
923 
924             if (is_tiny && (flags & float_flag_inexact)) {
925                 flags |= float_flag_underflow;
926             }
927             if (exp == 0 && frac == 0) {
928                 p.cls = float_class_zero;
929             }
930         }
931         break;
932 
933     case float_class_zero:
934     do_zero:
935         exp = 0;
936         frac = 0;
937         break;
938 
939     case float_class_inf:
940     do_inf:
941         assert(!parm->arm_althp);
942         exp = exp_max;
943         frac = 0;
944         break;
945 
946     case float_class_qnan:
947     case float_class_snan:
948         assert(!parm->arm_althp);
949         exp = exp_max;
950         frac >>= parm->frac_shift;
951         break;
952 
953     default:
954         g_assert_not_reached();
955     }
956 
957     float_raise(flags, s);
958     p.exp = exp;
959     p.frac = frac;
960     return p;
961 }
962 
963 
964 #define partsN(NAME)   parts64_##NAME
965 #define FloatPartsN    FloatParts64
966 
967 #include "softfloat-parts.c.inc"
968 
969 #undef  partsN
970 #undef  FloatPartsN
971 #define partsN(NAME)   parts128_##NAME
972 #define FloatPartsN    FloatParts128
973 
974 #include "softfloat-parts.c.inc"
975 
976 #undef  partsN
977 #undef  FloatPartsN
978 
979 /*
980  * Pack/unpack routines with a specific FloatFmt.
981  */
982 
983 static void float16a_unpack_canonical(FloatParts64 *p, float16 f,
984                                       float_status *s, const FloatFmt *params)
985 {
986     float16_unpack_raw(p, f);
987     *p = sf_canonicalize(*p, params, s);
988 }
989 
990 static void float16_unpack_canonical(FloatParts64 *p, float16 f,
991                                      float_status *s)
992 {
993     float16a_unpack_canonical(p, f, s, &float16_params);
994 }
995 
996 static void bfloat16_unpack_canonical(FloatParts64 *p, bfloat16 f,
997                                       float_status *s)
998 {
999     bfloat16_unpack_raw(p, f);
1000     *p = sf_canonicalize(*p, &bfloat16_params, s);
1001 }
1002 
1003 static float16 float16a_round_pack_canonical(FloatParts64 *p,
1004                                              float_status *s,
1005                                              const FloatFmt *params)
1006 {
1007     *p = round_canonical(*p, s, params);
1008     return float16_pack_raw(p);
1009 }
1010 
1011 static float16 float16_round_pack_canonical(FloatParts64 *p,
1012                                             float_status *s)
1013 {
1014     return float16a_round_pack_canonical(p, s, &float16_params);
1015 }
1016 
1017 static bfloat16 bfloat16_round_pack_canonical(FloatParts64 *p,
1018                                               float_status *s)
1019 {
1020     *p = round_canonical(*p, s, &bfloat16_params);
1021     return bfloat16_pack_raw(p);
1022 }
1023 
1024 static void float32_unpack_canonical(FloatParts64 *p, float32 f,
1025                                      float_status *s)
1026 {
1027     float32_unpack_raw(p, f);
1028     *p = sf_canonicalize(*p, &float32_params, s);
1029 }
1030 
1031 static float32 float32_round_pack_canonical(FloatParts64 *p,
1032                                             float_status *s)
1033 {
1034     *p = round_canonical(*p, s, &float32_params);
1035     return float32_pack_raw(p);
1036 }
1037 
1038 static void float64_unpack_canonical(FloatParts64 *p, float64 f,
1039                                      float_status *s)
1040 {
1041     float64_unpack_raw(p, f);
1042     *p = sf_canonicalize(*p, &float64_params, s);
1043 }
1044 
1045 static float64 float64_round_pack_canonical(FloatParts64 *p,
1046                                             float_status *s)
1047 {
1048     *p = round_canonical(*p, s, &float64_params);
1049     return float64_pack_raw(p);
1050 }
1051 
1052 /*
1053  * Returns the result of adding or subtracting the floating-point
1054  * values `a' and `b'. The operation is performed
1055  * according to the IEC/IEEE Standard for Binary Floating-Point
1056  * Arithmetic.
1057  */
1058 
1059 static FloatParts64 addsub_floats(FloatParts64 a, FloatParts64 b, bool subtract,
1060                                 float_status *s)
1061 {
1062     bool a_sign = a.sign;
1063     bool b_sign = b.sign ^ subtract;
1064 
1065     if (a_sign != b_sign) {
1066         /* Subtraction */
1067 
1068         if (a.cls == float_class_normal && b.cls == float_class_normal) {
1069             if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
1070                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
1071                 a.frac = a.frac - b.frac;
1072             } else {
1073                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
1074                 a.frac = b.frac - a.frac;
1075                 a.exp = b.exp;
1076                 a_sign ^= 1;
1077             }
1078 
1079             if (a.frac == 0) {
1080                 a.cls = float_class_zero;
1081                 a.sign = s->float_rounding_mode == float_round_down;
1082             } else {
1083                 int shift = clz64(a.frac);
1084                 a.frac = a.frac << shift;
1085                 a.exp = a.exp - shift;
1086                 a.sign = a_sign;
1087             }
1088             return a;
1089         }
1090         if (is_nan(a.cls) || is_nan(b.cls)) {
1091             return *parts_pick_nan(&a, &b, s);
1092         }
1093         if (a.cls == float_class_inf) {
1094             if (b.cls == float_class_inf) {
1095                 float_raise(float_flag_invalid, s);
1096                 parts_default_nan(&a, s);
1097             }
1098             return a;
1099         }
1100         if (a.cls == float_class_zero && b.cls == float_class_zero) {
1101             a.sign = s->float_rounding_mode == float_round_down;
1102             return a;
1103         }
1104         if (a.cls == float_class_zero || b.cls == float_class_inf) {
1105             b.sign = a_sign ^ 1;
1106             return b;
1107         }
1108         if (b.cls == float_class_zero) {
1109             return a;
1110         }
1111     } else {
1112         /* Addition */
1113         if (a.cls == float_class_normal && b.cls == float_class_normal) {
1114             if (a.exp > b.exp) {
1115                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
1116             } else if (a.exp < b.exp) {
1117                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
1118                 a.exp = b.exp;
1119             }
1120 
1121             if (uadd64_overflow(a.frac, b.frac, &a.frac)) {
1122                 shift64RightJamming(a.frac, 1, &a.frac);
1123                 a.frac |= DECOMPOSED_IMPLICIT_BIT;
1124                 a.exp += 1;
1125             }
1126             return a;
1127         }
1128         if (is_nan(a.cls) || is_nan(b.cls)) {
1129             return *parts_pick_nan(&a, &b, s);
1130         }
1131         if (a.cls == float_class_inf || b.cls == float_class_zero) {
1132             return a;
1133         }
1134         if (b.cls == float_class_inf || a.cls == float_class_zero) {
1135             b.sign = b_sign;
1136             return b;
1137         }
1138     }
1139     g_assert_not_reached();
1140 }
1141 
1142 /*
1143  * Returns the result of adding or subtracting the floating-point
1144  * values `a' and `b'. The operation is performed according to the
1145  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1146  */
1147 
1148 float16 QEMU_FLATTEN float16_add(float16 a, float16 b, float_status *status)
1149 {
1150     FloatParts64 pa, pb, pr;
1151 
1152     float16_unpack_canonical(&pa, a, status);
1153     float16_unpack_canonical(&pb, b, status);
1154     pr = addsub_floats(pa, pb, false, status);
1155 
1156     return float16_round_pack_canonical(&pr, status);
1157 }
1158 
1159 float16 QEMU_FLATTEN float16_sub(float16 a, float16 b, float_status *status)
1160 {
1161     FloatParts64 pa, pb, pr;
1162 
1163     float16_unpack_canonical(&pa, a, status);
1164     float16_unpack_canonical(&pb, b, status);
1165     pr = addsub_floats(pa, pb, true, status);
1166 
1167     return float16_round_pack_canonical(&pr, status);
1168 }
1169 
1170 static float32 QEMU_SOFTFLOAT_ATTR
1171 soft_f32_addsub(float32 a, float32 b, bool subtract, float_status *status)
1172 {
1173     FloatParts64 pa, pb, pr;
1174 
1175     float32_unpack_canonical(&pa, a, status);
1176     float32_unpack_canonical(&pb, b, status);
1177     pr = addsub_floats(pa, pb, subtract, status);
1178 
1179     return float32_round_pack_canonical(&pr, status);
1180 }
1181 
1182 static inline float32 soft_f32_add(float32 a, float32 b, float_status *status)
1183 {
1184     return soft_f32_addsub(a, b, false, status);
1185 }
1186 
1187 static inline float32 soft_f32_sub(float32 a, float32 b, float_status *status)
1188 {
1189     return soft_f32_addsub(a, b, true, status);
1190 }
1191 
1192 static float64 QEMU_SOFTFLOAT_ATTR
1193 soft_f64_addsub(float64 a, float64 b, bool subtract, float_status *status)
1194 {
1195     FloatParts64 pa, pb, pr;
1196 
1197     float64_unpack_canonical(&pa, a, status);
1198     float64_unpack_canonical(&pb, b, status);
1199     pr = addsub_floats(pa, pb, subtract, status);
1200 
1201     return float64_round_pack_canonical(&pr, status);
1202 }
1203 
1204 static inline float64 soft_f64_add(float64 a, float64 b, float_status *status)
1205 {
1206     return soft_f64_addsub(a, b, false, status);
1207 }
1208 
1209 static inline float64 soft_f64_sub(float64 a, float64 b, float_status *status)
1210 {
1211     return soft_f64_addsub(a, b, true, status);
1212 }
1213 
1214 static float hard_f32_add(float a, float b)
1215 {
1216     return a + b;
1217 }
1218 
1219 static float hard_f32_sub(float a, float b)
1220 {
1221     return a - b;
1222 }
1223 
1224 static double hard_f64_add(double a, double b)
1225 {
1226     return a + b;
1227 }
1228 
1229 static double hard_f64_sub(double a, double b)
1230 {
1231     return a - b;
1232 }
1233 
1234 static bool f32_addsubmul_post(union_float32 a, union_float32 b)
1235 {
1236     if (QEMU_HARDFLOAT_2F32_USE_FP) {
1237         return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
1238     }
1239     return !(float32_is_zero(a.s) && float32_is_zero(b.s));
1240 }
1241 
1242 static bool f64_addsubmul_post(union_float64 a, union_float64 b)
1243 {
1244     if (QEMU_HARDFLOAT_2F64_USE_FP) {
1245         return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
1246     } else {
1247         return !(float64_is_zero(a.s) && float64_is_zero(b.s));
1248     }
1249 }
1250 
1251 static float32 float32_addsub(float32 a, float32 b, float_status *s,
1252                               hard_f32_op2_fn hard, soft_f32_op2_fn soft)
1253 {
1254     return float32_gen2(a, b, s, hard, soft,
1255                         f32_is_zon2, f32_addsubmul_post);
1256 }
1257 
1258 static float64 float64_addsub(float64 a, float64 b, float_status *s,
1259                               hard_f64_op2_fn hard, soft_f64_op2_fn soft)
1260 {
1261     return float64_gen2(a, b, s, hard, soft,
1262                         f64_is_zon2, f64_addsubmul_post);
1263 }
1264 
1265 float32 QEMU_FLATTEN
1266 float32_add(float32 a, float32 b, float_status *s)
1267 {
1268     return float32_addsub(a, b, s, hard_f32_add, soft_f32_add);
1269 }
1270 
1271 float32 QEMU_FLATTEN
1272 float32_sub(float32 a, float32 b, float_status *s)
1273 {
1274     return float32_addsub(a, b, s, hard_f32_sub, soft_f32_sub);
1275 }
1276 
1277 float64 QEMU_FLATTEN
1278 float64_add(float64 a, float64 b, float_status *s)
1279 {
1280     return float64_addsub(a, b, s, hard_f64_add, soft_f64_add);
1281 }
1282 
1283 float64 QEMU_FLATTEN
1284 float64_sub(float64 a, float64 b, float_status *s)
1285 {
1286     return float64_addsub(a, b, s, hard_f64_sub, soft_f64_sub);
1287 }
1288 
1289 /*
1290  * Returns the result of adding or subtracting the bfloat16
1291  * values `a' and `b'.
1292  */
1293 bfloat16 QEMU_FLATTEN bfloat16_add(bfloat16 a, bfloat16 b, float_status *status)
1294 {
1295     FloatParts64 pa, pb, pr;
1296 
1297     bfloat16_unpack_canonical(&pa, a, status);
1298     bfloat16_unpack_canonical(&pb, b, status);
1299     pr = addsub_floats(pa, pb, false, status);
1300 
1301     return bfloat16_round_pack_canonical(&pr, status);
1302 }
1303 
1304 bfloat16 QEMU_FLATTEN bfloat16_sub(bfloat16 a, bfloat16 b, float_status *status)
1305 {
1306     FloatParts64 pa, pb, pr;
1307 
1308     bfloat16_unpack_canonical(&pa, a, status);
1309     bfloat16_unpack_canonical(&pb, b, status);
1310     pr = addsub_floats(pa, pb, true, status);
1311 
1312     return bfloat16_round_pack_canonical(&pr, status);
1313 }
1314 
1315 /*
1316  * Returns the result of multiplying the floating-point values `a' and
1317  * `b'. The operation is performed according to the IEC/IEEE Standard
1318  * for Binary Floating-Point Arithmetic.
1319  */
1320 
1321 static FloatParts64 mul_floats(FloatParts64 a, FloatParts64 b, float_status *s)
1322 {
1323     bool sign = a.sign ^ b.sign;
1324 
1325     if (a.cls == float_class_normal && b.cls == float_class_normal) {
1326         uint64_t hi, lo;
1327         int exp = a.exp + b.exp;
1328 
1329         mul64To128(a.frac, b.frac, &hi, &lo);
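        /*
         * Both fractions have their binary point at bit 63 (values in
         * [1.0, 2.0)), so the 128-bit product has its point at bit 126 and
         * lies in [1.0, 4.0).  E.g. for 1.5 * 1.5, hi is 0x9000000000000000:
         * bit 63 is set, so the exponent is bumped and the fraction
         * represents 1.125, giving 1.125 * 2^1 = 2.25.
         */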
1330         if (hi & DECOMPOSED_IMPLICIT_BIT) {
1331             exp += 1;
1332         } else {
1333             hi <<= 1;
1334         }
1335         hi |= (lo != 0);
1336 
1337         /* Re-use a */
1338         a.exp = exp;
1339         a.sign = sign;
1340         a.frac = hi;
1341         return a;
1342     }
1343     /* handle all the NaN cases */
1344     if (is_nan(a.cls) || is_nan(b.cls)) {
1345         return *parts_pick_nan(&a, &b, s);
1346     }
1347     /* Inf * Zero == NaN */
1348     if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
1349         (a.cls == float_class_zero && b.cls == float_class_inf)) {
1350         float_raise(float_flag_invalid, s);
1351         parts_default_nan(&a, s);
1352         return a;
1353     }
1354     /* Multiply by 0 or Inf */
1355     if (a.cls == float_class_inf || a.cls == float_class_zero) {
1356         a.sign = sign;
1357         return a;
1358     }
1359     if (b.cls == float_class_inf || b.cls == float_class_zero) {
1360         b.sign = sign;
1361         return b;
1362     }
1363     g_assert_not_reached();
1364 }
1365 
1366 float16 QEMU_FLATTEN float16_mul(float16 a, float16 b, float_status *status)
1367 {
1368     FloatParts64 pa, pb, pr;
1369 
1370     float16_unpack_canonical(&pa, a, status);
1371     float16_unpack_canonical(&pb, b, status);
1372     pr = mul_floats(pa, pb, status);
1373 
1374     return float16_round_pack_canonical(&pr, status);
1375 }
1376 
1377 static float32 QEMU_SOFTFLOAT_ATTR
1378 soft_f32_mul(float32 a, float32 b, float_status *status)
1379 {
1380     FloatParts64 pa, pb, pr;
1381 
1382     float32_unpack_canonical(&pa, a, status);
1383     float32_unpack_canonical(&pb, b, status);
1384     pr = mul_floats(pa, pb, status);
1385 
1386     return float32_round_pack_canonical(&pr, status);
1387 }
1388 
1389 static float64 QEMU_SOFTFLOAT_ATTR
1390 soft_f64_mul(float64 a, float64 b, float_status *status)
1391 {
1392     FloatParts64 pa, pb, pr;
1393 
1394     float64_unpack_canonical(&pa, a, status);
1395     float64_unpack_canonical(&pb, b, status);
1396     pr = mul_floats(pa, pb, status);
1397 
1398     return float64_round_pack_canonical(&pr, status);
1399 }
1400 
1401 static float hard_f32_mul(float a, float b)
1402 {
1403     return a * b;
1404 }
1405 
1406 static double hard_f64_mul(double a, double b)
1407 {
1408     return a * b;
1409 }
1410 
1411 float32 QEMU_FLATTEN
1412 float32_mul(float32 a, float32 b, float_status *s)
1413 {
1414     return float32_gen2(a, b, s, hard_f32_mul, soft_f32_mul,
1415                         f32_is_zon2, f32_addsubmul_post);
1416 }
1417 
1418 float64 QEMU_FLATTEN
1419 float64_mul(float64 a, float64 b, float_status *s)
1420 {
1421     return float64_gen2(a, b, s, hard_f64_mul, soft_f64_mul,
1422                         f64_is_zon2, f64_addsubmul_post);
1423 }
1424 
1425 /*
1426  * Returns the result of multiplying the bfloat16
1427  * values `a' and `b'.
1428  */
1429 
1430 bfloat16 QEMU_FLATTEN bfloat16_mul(bfloat16 a, bfloat16 b, float_status *status)
1431 {
1432     FloatParts64 pa, pb, pr;
1433 
1434     bfloat16_unpack_canonical(&pa, a, status);
1435     bfloat16_unpack_canonical(&pb, b, status);
1436     pr = mul_floats(pa, pb, status);
1437 
1438     return bfloat16_round_pack_canonical(&pr, status);
1439 }
1440 
1441 /*
1442  * Returns the result of multiplying the floating-point values `a' and
1443  * `b' then adding 'c', with no intermediate rounding step after the
1444  * multiplication. The operation is performed according to the
1445  * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
1446  * The flags argument allows the caller to select negation of the
1447  * addend, the intermediate product, or the final result. (The
1448  * difference between this and having the caller do a separate
1449  * negation is that negating externally will flip the sign bit on
1450  * NaNs.)
1451  */
1452 
1453 static FloatParts64 muladd_floats(FloatParts64 a, FloatParts64 b, FloatParts64 c,
1454                                 int flags, float_status *s)
1455 {
1456     bool inf_zero, p_sign;
1457     bool sign_flip = flags & float_muladd_negate_result;
1458     FloatClass p_class;
1459     uint64_t hi, lo;
1460     int p_exp;
1461     int ab_mask, abc_mask;
1462 
1463     ab_mask = float_cmask(a.cls) | float_cmask(b.cls);
1464     abc_mask = float_cmask(c.cls) | ab_mask;
1465     inf_zero = ab_mask == float_cmask_infzero;
1466 
1467     /* It is implementation-defined whether the cases of (0,inf,qnan)
1468      * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
1469      * they return if they do), so we have to hand this information
1470      * off to the target-specific pick-a-NaN routine.
1471      */
1472     if (unlikely(abc_mask & float_cmask_anynan)) {
1473         return *parts_pick_nan_muladd(&a, &b, &c, s, ab_mask, abc_mask);
1474     }
1475 
1476     if (inf_zero) {
1477         float_raise(float_flag_invalid, s);
1478         parts_default_nan(&a, s);
1479         return a;
1480     }
1481 
1482     if (flags & float_muladd_negate_c) {
1483         c.sign ^= 1;
1484     }
1485 
1486     p_sign = a.sign ^ b.sign;
1487 
1488     if (flags & float_muladd_negate_product) {
1489         p_sign ^= 1;
1490     }
1491 
1492     if (ab_mask & float_cmask_inf) {
1493         p_class = float_class_inf;
1494     } else if (ab_mask & float_cmask_zero) {
1495         p_class = float_class_zero;
1496     } else {
1497         p_class = float_class_normal;
1498     }
1499 
1500     if (c.cls == float_class_inf) {
1501         if (p_class == float_class_inf && p_sign != c.sign) {
1502             float_raise(float_flag_invalid, s);
1503             parts_default_nan(&c, s);
1504         } else {
1505             c.sign ^= sign_flip;
1506         }
1507         return c;
1508     }
1509 
1510     if (p_class == float_class_inf) {
1511         a.cls = float_class_inf;
1512         a.sign = p_sign ^ sign_flip;
1513         return a;
1514     }
1515 
1516     if (p_class == float_class_zero) {
1517         if (c.cls == float_class_zero) {
1518             if (p_sign != c.sign) {
1519                 p_sign = s->float_rounding_mode == float_round_down;
1520             }
1521             c.sign = p_sign;
1522         } else if (flags & float_muladd_halve_result) {
1523             c.exp -= 1;
1524         }
1525         c.sign ^= sign_flip;
1526         return c;
1527     }
1528 
1529     /* a & b should be normals now... */
1530     assert(a.cls == float_class_normal &&
1531            b.cls == float_class_normal);
1532 
1533     p_exp = a.exp + b.exp;
1534 
1535     mul64To128(a.frac, b.frac, &hi, &lo);
1536 
1537     /* Renormalize to the msb. */
1538     if (hi & DECOMPOSED_IMPLICIT_BIT) {
1539         p_exp += 1;
1540     } else {
1541         shortShift128Left(hi, lo, 1, &hi, &lo);
1542     }
1543 
1544     /* + add/sub */
1545     if (c.cls != float_class_zero) {
1546         int exp_diff = p_exp - c.exp;
1547         if (p_sign == c.sign) {
1548             /* Addition */
1549             if (exp_diff <= 0) {
1550                 shift64RightJamming(hi, -exp_diff, &hi);
1551                 p_exp = c.exp;
1552                 if (uadd64_overflow(hi, c.frac, &hi)) {
1553                     shift64RightJamming(hi, 1, &hi);
1554                     hi |= DECOMPOSED_IMPLICIT_BIT;
1555                     p_exp += 1;
1556                 }
1557             } else {
1558                 uint64_t c_hi, c_lo, over;
1559                 shift128RightJamming(c.frac, 0, exp_diff, &c_hi, &c_lo);
1560                 add192(0, hi, lo, 0, c_hi, c_lo, &over, &hi, &lo);
1561                 if (over) {
1562                     shift64RightJamming(hi, 1, &hi);
1563                     hi |= DECOMPOSED_IMPLICIT_BIT;
1564                     p_exp += 1;
1565                 }
1566             }
1567         } else {
1568             /* Subtraction */
1569             uint64_t c_hi = c.frac, c_lo = 0;
1570 
1571             if (exp_diff <= 0) {
1572                 shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
1573                 if (exp_diff == 0
1574                     &&
1575                     (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
1576                     sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1577                 } else {
1578                     sub128(c_hi, c_lo, hi, lo, &hi, &lo);
1579                     p_sign ^= 1;
1580                     p_exp = c.exp;
1581                 }
1582             } else {
1583                 shift128RightJamming(c_hi, c_lo,
1584                                      exp_diff,
1585                                      &c_hi, &c_lo);
1586                 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1587             }
1588 
1589             if (hi == 0 && lo == 0) {
1590                 a.cls = float_class_zero;
1591                 a.sign = s->float_rounding_mode == float_round_down;
1592                 a.sign ^= sign_flip;
1593                 return a;
1594             } else {
1595                 int shift;
1596                 if (hi != 0) {
1597                     shift = clz64(hi);
1598                 } else {
1599                     shift = clz64(lo) + 64;
1600                 }
1601                 /* Normalizing to a binary point of 124 is the
1602                    correct adjustment for the exponent.  However, since we're
1603                    shifting, we might as well put the binary point back
1604                    at 63 where we really want it.  Therefore shift as
1605                    if we're leaving 1 bit at the top of the word, but
1606                    adjust the exponent as if we're leaving 3 bits.  */
1607                 shift128Left(hi, lo, shift, &hi, &lo);
1608                 p_exp -= shift;
1609             }
1610         }
1611     }
1612     hi |= (lo != 0);
1613 
1614     if (flags & float_muladd_halve_result) {
1615         p_exp -= 1;
1616     }
1617 
1618     /* finally prepare our result */
1619     a.cls = float_class_normal;
1620     a.sign = p_sign ^ sign_flip;
1621     a.exp = p_exp;
1622     a.frac = hi;
1623 
1624     return a;
1625 }
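
/*
 * For example, a guest fused multiply-subtract maps onto
 *     float32_muladd(a, b, c, float_muladd_negate_c, s)
 * which computes a * b - c with a single rounding; float_muladd_negate_product
 * and float_muladd_negate_result select the remaining negated variants.
 */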
1626 
1627 float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c,
1628                                                 int flags, float_status *status)
1629 {
1630     FloatParts64 pa, pb, pc, pr;
1631 
1632     float16_unpack_canonical(&pa, a, status);
1633     float16_unpack_canonical(&pb, b, status);
1634     float16_unpack_canonical(&pc, c, status);
1635     pr = muladd_floats(pa, pb, pc, flags, status);
1636 
1637     return float16_round_pack_canonical(&pr, status);
1638 }
1639 
1640 static float32 QEMU_SOFTFLOAT_ATTR
1641 soft_f32_muladd(float32 a, float32 b, float32 c, int flags,
1642                 float_status *status)
1643 {
1644     FloatParts64 pa, pb, pc, pr;
1645 
1646     float32_unpack_canonical(&pa, a, status);
1647     float32_unpack_canonical(&pb, b, status);
1648     float32_unpack_canonical(&pc, c, status);
1649     pr = muladd_floats(pa, pb, pc, flags, status);
1650 
1651     return float32_round_pack_canonical(&pr, status);
1652 }
1653 
1654 static float64 QEMU_SOFTFLOAT_ATTR
1655 soft_f64_muladd(float64 a, float64 b, float64 c, int flags,
1656                 float_status *status)
1657 {
1658     FloatParts64 pa, pb, pc, pr;
1659 
1660     float64_unpack_canonical(&pa, a, status);
1661     float64_unpack_canonical(&pb, b, status);
1662     float64_unpack_canonical(&pc, c, status);
1663     pr = muladd_floats(pa, pb, pc, flags, status);
1664 
1665     return float64_round_pack_canonical(&pr, status);
1666 }
1667 
1668 static bool force_soft_fma;
1669 
1670 float32 QEMU_FLATTEN
1671 float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s)
1672 {
1673     union_float32 ua, ub, uc, ur;
1674 
1675     ua.s = xa;
1676     ub.s = xb;
1677     uc.s = xc;
1678 
1679     if (unlikely(!can_use_fpu(s))) {
1680         goto soft;
1681     }
1682     if (unlikely(flags & float_muladd_halve_result)) {
1683         goto soft;
1684     }
1685 
1686     float32_input_flush3(&ua.s, &ub.s, &uc.s, s);
1687     if (unlikely(!f32_is_zon3(ua, ub, uc))) {
1688         goto soft;
1689     }
1690 
1691     if (unlikely(force_soft_fma)) {
1692         goto soft;
1693     }
1694 
1695     /*
1696      * When (a || b) == 0, there's no need to check for underflow or overflow,
1697      * since we know the addend is (normal || 0) and the product is 0.
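     * Doing the addition on the host FPU below also gives the correct
     * IEEE sign for a zero sum when the addend is a zero of the opposite
     * sign, without open-coding that rule.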
1698      */
1699     if (float32_is_zero(ua.s) || float32_is_zero(ub.s)) {
1700         union_float32 up;
1701         bool prod_sign;
1702 
1703         prod_sign = float32_is_neg(ua.s) ^ float32_is_neg(ub.s);
1704         prod_sign ^= !!(flags & float_muladd_negate_product);
1705         up.s = float32_set_sign(float32_zero, prod_sign);
1706 
1707         if (flags & float_muladd_negate_c) {
1708             uc.h = -uc.h;
1709         }
1710         ur.h = up.h + uc.h;
1711     } else {
1712         union_float32 ua_orig = ua;
1713         union_float32 uc_orig = uc;
1714 
1715         if (flags & float_muladd_negate_product) {
1716             ua.h = -ua.h;
1717         }
1718         if (flags & float_muladd_negate_c) {
1719             uc.h = -uc.h;
1720         }
1721 
1722         ur.h = fmaf(ua.h, ub.h, uc.h);
1723 
1724         if (unlikely(f32_is_inf(ur))) {
1725             float_raise(float_flag_overflow, s);
1726         } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) {
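            /*
             * The magnitude is at or below FLT_MIN, so the result may be
             * subnormal or zero; redo the operation in softfloat so that
             * underflow and any flush-to-zero of the result follow the
             * guest's float_status.
             */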
1727             ua = ua_orig;
1728             uc = uc_orig;
1729             goto soft;
1730         }
1731     }
1732     if (flags & float_muladd_negate_result) {
1733         return float32_chs(ur.s);
1734     }
1735     return ur.s;
1736 
1737  soft:
1738     return soft_f32_muladd(ua.s, ub.s, uc.s, flags, s);
1739 }
1740 
1741 float64 QEMU_FLATTEN
1742 float64_muladd(float64 xa, float64 xb, float64 xc, int flags, float_status *s)
1743 {
1744     union_float64 ua, ub, uc, ur;
1745 
1746     ua.s = xa;
1747     ub.s = xb;
1748     uc.s = xc;
1749 
1750     if (unlikely(!can_use_fpu(s))) {
1751         goto soft;
1752     }
1753     if (unlikely(flags & float_muladd_halve_result)) {
1754         goto soft;
1755     }
1756 
1757     float64_input_flush3(&ua.s, &ub.s, &uc.s, s);
1758     if (unlikely(!f64_is_zon3(ua, ub, uc))) {
1759         goto soft;
1760     }
1761 
1762     if (unlikely(force_soft_fma)) {
1763         goto soft;
1764     }
1765 
1766     /*
1767      * When (a || b) == 0, there's no need to check for underflow or overflow,
1768      * since we know the addend is (normal || 0) and the product is 0.
1769      */
1770     if (float64_is_zero(ua.s) || float64_is_zero(ub.s)) {
1771         union_float64 up;
1772         bool prod_sign;
1773 
1774         prod_sign = float64_is_neg(ua.s) ^ float64_is_neg(ub.s);
1775         prod_sign ^= !!(flags & float_muladd_negate_product);
1776         up.s = float64_set_sign(float64_zero, prod_sign);
1777 
1778         if (flags & float_muladd_negate_c) {
1779             uc.h = -uc.h;
1780         }
1781         ur.h = up.h + uc.h;
1782     } else {
1783         union_float64 ua_orig = ua;
1784         union_float64 uc_orig = uc;
1785 
1786         if (flags & float_muladd_negate_product) {
1787             ua.h = -ua.h;
1788         }
1789         if (flags & float_muladd_negate_c) {
1790             uc.h = -uc.h;
1791         }
1792 
1793         ur.h = fma(ua.h, ub.h, uc.h);
1794 
1795         if (unlikely(f64_is_inf(ur))) {
1796             float_raise(float_flag_overflow, s);
1797         } else if (unlikely(fabs(ur.h) <= DBL_MIN)) {
1798             ua = ua_orig;
1799             uc = uc_orig;
1800             goto soft;
1801         }
1802     }
1803     if (flags & float_muladd_negate_result) {
1804         return float64_chs(ur.s);
1805     }
1806     return ur.s;
1807 
1808  soft:
1809     return soft_f64_muladd(ua.s, ub.s, uc.s, flags, s);
1810 }
1811 
1812 /*
1813  * Returns the result of multiplying the bfloat16 values `a'
1814  * and `b', then adding `c', with no intermediate rounding step after the
1815  * multiplication.
1816  */
1817 
1818 bfloat16 QEMU_FLATTEN bfloat16_muladd(bfloat16 a, bfloat16 b, bfloat16 c,
1819                                       int flags, float_status *status)
1820 {
1821     FloatParts64 pa, pb, pc, pr;
1822 
1823     bfloat16_unpack_canonical(&pa, a, status);
1824     bfloat16_unpack_canonical(&pb, b, status);
1825     bfloat16_unpack_canonical(&pc, c, status);
1826     pr = muladd_floats(pa, pb, pc, flags, status);
1827 
1828     return bfloat16_round_pack_canonical(&pr, status);
1829 }
1830 
1831 /*
1832  * Returns the result of dividing the floating-point value `a' by the
1833  * corresponding value `b'. The operation is performed according to
1834  * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1835  */
1836 
1837 static FloatParts64 div_floats(FloatParts64 a, FloatParts64 b, float_status *s)
1838 {
1839     bool sign = a.sign ^ b.sign;
1840 
1841     if (a.cls == float_class_normal && b.cls == float_class_normal) {
1842         uint64_t n0, n1, q, r;
1843         int exp = a.exp - b.exp;
1844 
1845         /*
1846          * We want a 2*N / N-bit division to produce exactly an N-bit
1847          * result, so that we do not lose any precision and so that we
1848          * do not have to renormalize afterward.  If A.frac < B.frac,
1849          * then division would produce an (N-1)-bit result; shift A left
1850  * by one to produce an N-bit result, and decrement the
1851          * exponent to match.
1852          *
1853          * The udiv_qrnnd algorithm that we're using requires normalization,
1854          * i.e. the msb of the denominator must be set, which is already true.
1855          */
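        /*
         * Illustration with 4-bit fractions: for a.frac = 1001 and
         * b.frac = 1100, the smaller shift would give 1001000 / 1100 = 110,
         * only a 3-bit quotient; shifting one bit further gives
         * 10010000 / 1100 = 1100, a full 4-bit quotient with its msb set,
         * and the decremented exponent compensates.
         */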
1856         if (a.frac < b.frac) {
1857             exp -= 1;
1858             shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, &n1, &n0);
1859         } else {
1860             shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT, &n1, &n0);
1861         }
1862         q = udiv_qrnnd(&r, n1, n0, b.frac);
1863 
1864         /* Set lsb if there is a remainder, to set inexact. */
1865         a.frac = q | (r != 0);
1866         a.sign = sign;
1867         a.exp = exp;
1868         return a;
1869     }
1870     /* handle all the NaN cases */
1871     if (is_nan(a.cls) || is_nan(b.cls)) {
1872         return *parts_pick_nan(&a, &b, s);
1873     }
1874     /* 0/0 or Inf/Inf */
1875     if (a.cls == b.cls
1876         &&
1877         (a.cls == float_class_inf || a.cls == float_class_zero)) {
1878         float_raise(float_flag_invalid, s);
1879         parts_default_nan(&a, s);
1880         return a;
1881     }
1882     /* Inf / x or 0 / x */
1883     if (a.cls == float_class_inf || a.cls == float_class_zero) {
1884         a.sign = sign;
1885         return a;
1886     }
1887     /* Div 0 => Inf */
1888     if (b.cls == float_class_zero) {
1889         float_raise(float_flag_divbyzero, s);
1890         a.cls = float_class_inf;
1891         a.sign = sign;
1892         return a;
1893     }
1894     /* Div by Inf */
1895     if (b.cls == float_class_inf) {
1896         a.cls = float_class_zero;
1897         a.sign = sign;
1898         return a;
1899     }
1900     g_assert_not_reached();
1901 }
1902 
1903 float16 float16_div(float16 a, float16 b, float_status *status)
1904 {
1905     FloatParts64 pa, pb, pr;
1906 
1907     float16_unpack_canonical(&pa, a, status);
1908     float16_unpack_canonical(&pb, b, status);
1909     pr = div_floats(pa, pb, status);
1910 
1911     return float16_round_pack_canonical(&pr, status);
1912 }
1913 
1914 static float32 QEMU_SOFTFLOAT_ATTR
1915 soft_f32_div(float32 a, float32 b, float_status *status)
1916 {
1917     FloatParts64 pa, pb, pr;
1918 
1919     float32_unpack_canonical(&pa, a, status);
1920     float32_unpack_canonical(&pb, b, status);
1921     pr = div_floats(pa, pb, status);
1922 
1923     return float32_round_pack_canonical(&pr, status);
1924 }
1925 
1926 static float64 QEMU_SOFTFLOAT_ATTR
1927 soft_f64_div(float64 a, float64 b, float_status *status)
1928 {
1929     FloatParts64 pa, pb, pr;
1930 
1931     float64_unpack_canonical(&pa, a, status);
1932     float64_unpack_canonical(&pb, b, status);
1933     pr = div_floats(pa, pb, status);
1934 
1935     return float64_round_pack_canonical(&pr, status);
1936 }
1937 
1938 static float hard_f32_div(float a, float b)
1939 {
1940     return a / b;
1941 }
1942 
1943 static double hard_f64_div(double a, double b)
1944 {
1945     return a / b;
1946 }
1947 
1948 static bool f32_div_pre(union_float32 a, union_float32 b)
1949 {
1950     if (QEMU_HARDFLOAT_2F32_USE_FP) {
1951         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
1952                fpclassify(b.h) == FP_NORMAL;
1953     }
1954     return float32_is_zero_or_normal(a.s) && float32_is_normal(b.s);
1955 }
1956 
1957 static bool f64_div_pre(union_float64 a, union_float64 b)
1958 {
1959     if (QEMU_HARDFLOAT_2F64_USE_FP) {
1960         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
1961                fpclassify(b.h) == FP_NORMAL;
1962     }
1963     return float64_is_zero_or_normal(a.s) && float64_is_normal(b.s);
1964 }
1965 
1966 static bool f32_div_post(union_float32 a, union_float32 b)
1967 {
1968     if (QEMU_HARDFLOAT_2F32_USE_FP) {
1969         return fpclassify(a.h) != FP_ZERO;
1970     }
1971     return !float32_is_zero(a.s);
1972 }
1973 
1974 static bool f64_div_post(union_float64 a, union_float64 b)
1975 {
1976     if (QEMU_HARDFLOAT_2F64_USE_FP) {
1977         return fpclassify(a.h) != FP_ZERO;
1978     }
1979     return !float64_is_zero(a.s);
1980 }
1981 
1982 float32 QEMU_FLATTEN
1983 float32_div(float32 a, float32 b, float_status *s)
1984 {
1985     return float32_gen2(a, b, s, hard_f32_div, soft_f32_div,
1986                         f32_div_pre, f32_div_post);
1987 }
1988 
1989 float64 QEMU_FLATTEN
1990 float64_div(float64 a, float64 b, float_status *s)
1991 {
1992     return float64_gen2(a, b, s, hard_f64_div, soft_f64_div,
1993                         f64_div_pre, f64_div_post);
1994 }
1995 
1996 /*
1997  * Returns the result of dividing the bfloat16
1998  * value `a' by the corresponding value `b'.
1999  */
2000 
2001 bfloat16 bfloat16_div(bfloat16 a, bfloat16 b, float_status *status)
2002 {
2003     FloatParts64 pa, pb, pr;
2004 
2005     bfloat16_unpack_canonical(&pa, a, status);
2006     bfloat16_unpack_canonical(&pb, b, status);
2007     pr = div_floats(pa, pb, status);
2008 
2009     return bfloat16_round_pack_canonical(&pr, status);
2010 }
2011 
2012 /*
2013  * Float to Float conversions
2014  *
2015  * Returns the result of converting one float format to another. The
2016  * conversion is performed according to the IEC/IEEE Standard for
2017  * Binary Floating-Point Arithmetic.
2018  *
2019  * The float_to_float helper only needs to take care of raising
2020  * invalid exceptions and handling the conversion on NaNs.
2021  */
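/*
 * Note: dstf->arm_althp selects the Arm alternative half-precision
 * format, which reuses the Inf/NaN encodings for extra exponent range;
 * that is why those classes need the special handling below.
 */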
2022 
2023 static FloatParts64 float_to_float(FloatParts64 a, const FloatFmt *dstf,
2024                                  float_status *s)
2025 {
2026     if (dstf->arm_althp) {
2027         switch (a.cls) {
2028         case float_class_qnan:
2029         case float_class_snan:
2030             /* There is no NaN in the destination format.  Raise Invalid
2031              * and return a zero with the sign of the input NaN.
2032              */
2033             float_raise(float_flag_invalid, s);
2034             a.cls = float_class_zero;
2035             a.frac = 0;
2036             a.exp = 0;
2037             break;
2038 
2039         case float_class_inf:
2040             /* There is no Inf in the destination format.  Raise Invalid
2041              * and return the maximum normal with the correct sign.
2042              */
2043             float_raise(float_flag_invalid, s);
2044             a.cls = float_class_normal;
2045             a.exp = dstf->exp_max;
2046             a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift;
2047             break;
2048 
2049         default:
2050             break;
2051         }
2052     } else if (is_nan(a.cls)) {
2053         parts_return_nan(&a, s);
2054     }
2055     return a;
2056 }
2057 
2058 float32 float16_to_float32(float16 a, bool ieee, float_status *s)
2059 {
2060     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
2061     FloatParts64 pa, pr;
2062 
2063     float16a_unpack_canonical(&pa, a, s, fmt16);
2064     pr = float_to_float(pa, &float32_params, s);
2065     return float32_round_pack_canonical(&pr, s);
2066 }
2067 
2068 float64 float16_to_float64(float16 a, bool ieee, float_status *s)
2069 {
2070     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
2071     FloatParts64 pa, pr;
2072 
2073     float16a_unpack_canonical(&pa, a, s, fmt16);
2074     pr = float_to_float(pa, &float64_params, s);
2075     return float64_round_pack_canonical(&pr, s);
2076 }
2077 
2078 float16 float32_to_float16(float32 a, bool ieee, float_status *s)
2079 {
2080     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
2081     FloatParts64 pa, pr;
2082 
2083     float32_unpack_canonical(&pa, a, s);
2084     pr = float_to_float(pa, fmt16, s);
2085     return float16a_round_pack_canonical(&pr, s, fmt16);
2086 }
2087 
2088 static float64 QEMU_SOFTFLOAT_ATTR
2089 soft_float32_to_float64(float32 a, float_status *s)
2090 {
2091     FloatParts64 pa, pr;
2092 
2093     float32_unpack_canonical(&pa, a, s);
2094     pr = float_to_float(pa, &float64_params, s);
2095     return float64_round_pack_canonical(&pr, s);
2096 }
2097 
2098 float64 float32_to_float64(float32 a, float_status *s)
2099 {
2100     if (likely(float32_is_normal(a))) {
2101         /* Widening conversion can never produce inexact results.  */
2102         union_float32 uf;
2103         union_float64 ud;
2104         uf.s = a;
2105         ud.h = uf.h;
2106         return ud.s;
2107     } else if (float32_is_zero(a)) {
2108         return float64_set_sign(float64_zero, float32_is_neg(a));
2109     } else {
2110         return soft_float32_to_float64(a, s);
2111     }
2112 }
2113 
2114 float16 float64_to_float16(float64 a, bool ieee, float_status *s)
2115 {
2116     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
2117     FloatParts64 pa, pr;
2118 
2119     float64_unpack_canonical(&pa, a, s);
2120     pr = float_to_float(pa, fmt16, s);
2121     return float16a_round_pack_canonical(&pr, s, fmt16);
2122 }
2123 
2124 float32 float64_to_float32(float64 a, float_status *s)
2125 {
2126     FloatParts64 pa, pr;
2127 
2128     float64_unpack_canonical(&pa, a, s);
2129     pr = float_to_float(pa, &float32_params, s);
2130     return float32_round_pack_canonical(&pr, s);
2131 }
2132 
2133 float32 bfloat16_to_float32(bfloat16 a, float_status *s)
2134 {
2135     FloatParts64 pa, pr;
2136 
2137     bfloat16_unpack_canonical(&pa, a, s);
2138     pr = float_to_float(pa, &float32_params, s);
2139     return float32_round_pack_canonical(&pr, s);
2140 }
2141 
2142 float64 bfloat16_to_float64(bfloat16 a, float_status *s)
2143 {
2144     FloatParts64 pa, pr;
2145 
2146     bfloat16_unpack_canonical(&pa, a, s);
2147     pr = float_to_float(pa, &float64_params, s);
2148     return float64_round_pack_canonical(&pr, s);
2149 }
2150 
2151 bfloat16 float32_to_bfloat16(float32 a, float_status *s)
2152 {
2153     FloatParts64 pa, pr;
2154 
2155     float32_unpack_canonical(&pa, a, s);
2156     pr = float_to_float(pa, &bfloat16_params, s);
2157     return bfloat16_round_pack_canonical(&pr, s);
2158 }
2159 
2160 bfloat16 float64_to_bfloat16(float64 a, float_status *s)
2161 {
2162     FloatParts64 pa, pr;
2163 
2164     float64_unpack_canonical(&pa, a, s);
2165     pr = float_to_float(pa, &bfloat16_params, s);
2166     return bfloat16_round_pack_canonical(&pr, s);
2167 }
2168 
2169 /*
2170  * Rounds the floating-point value `a' to an integer, and returns the
2171  * result as a floating-point value. The operation is performed
2172  * according to the IEC/IEEE Standard for Binary Floating-Point
2173  * Arithmetic.
2174  */
2175 
2176 static FloatParts64 round_to_int(FloatParts64 a, FloatRoundMode rmode,
2177                                int scale, float_status *s)
2178 {
2179     switch (a.cls) {
2180     case float_class_qnan:
2181     case float_class_snan:
2182         parts_return_nan(&a, s);
2183         break;
2184 
2185     case float_class_zero:
2186     case float_class_inf:
2187         /* already "integral" */
2188         break;
2189 
2190     case float_class_normal:
2191         scale = MIN(MAX(scale, -0x10000), 0x10000);
2192         a.exp += scale;
2193 
2194         if (a.exp >= DECOMPOSED_BINARY_POINT) {
2195             /* already integral */
2196             break;
2197         }
2198         if (a.exp < 0) {
2199             bool one;
2200             /* all fractional */
2201             float_raise(float_flag_inexact, s);
2202             switch (rmode) {
2203             case float_round_nearest_even:
2204                 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
2205                 break;
2206             case float_round_ties_away:
2207                 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
2208                 break;
2209             case float_round_to_zero:
2210                 one = false;
2211                 break;
2212             case float_round_up:
2213                 one = !a.sign;
2214                 break;
2215             case float_round_down:
2216                 one = a.sign;
2217                 break;
2218             case float_round_to_odd:
2219                 one = true;
2220                 break;
2221             default:
2222                 g_assert_not_reached();
2223             }
2224 
2225             if (one) {
2226                 a.frac = DECOMPOSED_IMPLICIT_BIT;
2227                 a.exp = 0;
2228             } else {
2229                 a.cls = float_class_zero;
2230             }
2231         } else {
2232             uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
2233             uint64_t frac_lsbm1 = frac_lsb >> 1;
2234             uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
2235             uint64_t rnd_mask = rnd_even_mask >> 1;
2236             uint64_t inc;
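            /*
             * Example: for a.exp == 2 the integer part occupies the leading
             * three bits of the significand, frac_lsb is the bit with weight
             * 1, frac_lsbm1 the bit with weight 1/2, and rnd_mask covers
             * every fractional bit that rounding may discard.
             */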
2237 
2238             switch (rmode) {
2239             case float_round_nearest_even:
2240                 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
2241                 break;
2242             case float_round_ties_away:
2243                 inc = frac_lsbm1;
2244                 break;
2245             case float_round_to_zero:
2246                 inc = 0;
2247                 break;
2248             case float_round_up:
2249                 inc = a.sign ? 0 : rnd_mask;
2250                 break;
2251             case float_round_down:
2252                 inc = a.sign ? rnd_mask : 0;
2253                 break;
2254             case float_round_to_odd:
2255                 inc = a.frac & frac_lsb ? 0 : rnd_mask;
2256                 break;
2257             default:
2258                 g_assert_not_reached();
2259             }
2260 
2261             if (a.frac & rnd_mask) {
2262                 float_raise(float_flag_inexact, s);
2263                 if (uadd64_overflow(a.frac, inc, &a.frac)) {
2264                     a.frac >>= 1;
2265                     a.frac |= DECOMPOSED_IMPLICIT_BIT;
2266                     a.exp++;
2267                 }
2268                 a.frac &= ~rnd_mask;
2269             }
2270         }
2271         break;
2272     default:
2273         g_assert_not_reached();
2274     }
2275     return a;
2276 }
2277 
2278 float16 float16_round_to_int(float16 a, float_status *s)
2279 {
2280     FloatParts64 pa, pr;
2281 
2282     float16_unpack_canonical(&pa, a, s);
2283     pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2284     return float16_round_pack_canonical(&pr, s);
2285 }
2286 
2287 float32 float32_round_to_int(float32 a, float_status *s)
2288 {
2289     FloatParts64 pa, pr;
2290 
2291     float32_unpack_canonical(&pa, a, s);
2292     pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2293     return float32_round_pack_canonical(&pr, s);
2294 }
2295 
2296 float64 float64_round_to_int(float64 a, float_status *s)
2297 {
2298     FloatParts64 pa, pr;
2299 
2300     float64_unpack_canonical(&pa, a, s);
2301     pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2302     return float64_round_pack_canonical(&pr, s);
2303 }
2304 
2305 /*
2306  * Rounds the bfloat16 value `a' to an integer, and returns the
2307  * result as a bfloat16 value.
2308  */
2309 
2310 bfloat16 bfloat16_round_to_int(bfloat16 a, float_status *s)
2311 {
2312     FloatParts64 pa, pr;
2313 
2314     bfloat16_unpack_canonical(&pa, a, s);
2315     pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2316     return bfloat16_round_pack_canonical(&pr, s);
2317 }
2318 
2319 /*
2320  * Returns the result of converting the floating-point value `a' to
2321  * the two's complement integer format. The conversion is performed
2322  * according to the IEC/IEEE Standard for Binary Floating-Point
2323  * Arithmetic---which means in particular that the conversion is
2324  * rounded according to the current rounding mode. If `a' is a NaN,
2325  * the largest positive integer is returned. Otherwise, if the
2326  * conversion overflows, the largest integer with the same sign as `a'
2327  * is returned.
2328  */
2329 
2330 static int64_t round_to_int_and_pack(FloatParts64 in, FloatRoundMode rmode,
2331                                      int scale, int64_t min, int64_t max,
2332                                      float_status *s)
2333 {
2334     uint64_t r;
2335     int orig_flags = get_float_exception_flags(s);
2336     FloatParts64 p = round_to_int(in, rmode, scale, s);
2337 
2338     switch (p.cls) {
2339     case float_class_snan:
2340     case float_class_qnan:
2341         s->float_exception_flags = orig_flags | float_flag_invalid;
2342         return max;
2343     case float_class_inf:
2344         s->float_exception_flags = orig_flags | float_flag_invalid;
2345         return p.sign ? min : max;
2346     case float_class_zero:
2347         return 0;
2348     case float_class_normal:
2349         if (p.exp <= DECOMPOSED_BINARY_POINT) {
2350             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
2351         } else {
2352             r = UINT64_MAX;
2353         }
2354         if (p.sign) {
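            /*
             * Note: -(uint64_t)min is the magnitude of the most negative
             * representable value, computed without signed overflow.
             */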
2355             if (r <= -(uint64_t) min) {
2356                 return -r;
2357             } else {
2358                 s->float_exception_flags = orig_flags | float_flag_invalid;
2359                 return min;
2360             }
2361         } else {
2362             if (r <= max) {
2363                 return r;
2364             } else {
2365                 s->float_exception_flags = orig_flags | float_flag_invalid;
2366                 return max;
2367             }
2368         }
2369     default:
2370         g_assert_not_reached();
2371     }
2372 }
2373 
2374 int8_t float16_to_int8_scalbn(float16 a, FloatRoundMode rmode, int scale,
2375                               float_status *s)
2376 {
2377     FloatParts64 p;
2378 
2379     float16_unpack_canonical(&p, a, s);
2380     return round_to_int_and_pack(p, rmode, scale, INT8_MIN, INT8_MAX, s);
2381 }
2382 
2383 int16_t float16_to_int16_scalbn(float16 a, FloatRoundMode rmode, int scale,
2384                                 float_status *s)
2385 {
2386     FloatParts64 p;
2387 
2388     float16_unpack_canonical(&p, a, s);
2389     return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s);
2390 }
2391 
2392 int32_t float16_to_int32_scalbn(float16 a, FloatRoundMode rmode, int scale,
2393                                 float_status *s)
2394 {
2395     FloatParts64 p;
2396 
2397     float16_unpack_canonical(&p, a, s);
2398     return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s);
2399 }
2400 
2401 int64_t float16_to_int64_scalbn(float16 a, FloatRoundMode rmode, int scale,
2402                                 float_status *s)
2403 {
2404     FloatParts64 p;
2405 
2406     float16_unpack_canonical(&p, a, s);
2407     return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s);
2408 }
2409 
2410 int16_t float32_to_int16_scalbn(float32 a, FloatRoundMode rmode, int scale,
2411                                 float_status *s)
2412 {
2413     FloatParts64 p;
2414 
2415     float32_unpack_canonical(&p, a, s);
2416     return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s);
2417 }
2418 
2419 int32_t float32_to_int32_scalbn(float32 a, FloatRoundMode rmode, int scale,
2420                                 float_status *s)
2421 {
2422     FloatParts64 p;
2423 
2424     float32_unpack_canonical(&p, a, s);
2425     return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s);
2426 }
2427 
2428 int64_t float32_to_int64_scalbn(float32 a, FloatRoundMode rmode, int scale,
2429                                 float_status *s)
2430 {
2431     FloatParts64 p;
2432 
2433     float32_unpack_canonical(&p, a, s);
2434     return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s);
2435 }
2436 
2437 int16_t float64_to_int16_scalbn(float64 a, FloatRoundMode rmode, int scale,
2438                                 float_status *s)
2439 {
2440     FloatParts64 p;
2441 
2442     float64_unpack_canonical(&p, a, s);
2443     return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s);
2444 }
2445 
2446 int32_t float64_to_int32_scalbn(float64 a, FloatRoundMode rmode, int scale,
2447                                 float_status *s)
2448 {
2449     FloatParts64 p;
2450 
2451     float64_unpack_canonical(&p, a, s);
2452     return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s);
2453 }
2454 
2455 int64_t float64_to_int64_scalbn(float64 a, FloatRoundMode rmode, int scale,
2456                                 float_status *s)
2457 {
2458     FloatParts64 p;
2459 
2460     float64_unpack_canonical(&p, a, s);
2461     return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s);
2462 }
2463 
2464 int8_t float16_to_int8(float16 a, float_status *s)
2465 {
2466     return float16_to_int8_scalbn(a, s->float_rounding_mode, 0, s);
2467 }
2468 
2469 int16_t float16_to_int16(float16 a, float_status *s)
2470 {
2471     return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2472 }
2473 
2474 int32_t float16_to_int32(float16 a, float_status *s)
2475 {
2476     return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2477 }
2478 
2479 int64_t float16_to_int64(float16 a, float_status *s)
2480 {
2481     return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2482 }
2483 
2484 int16_t float32_to_int16(float32 a, float_status *s)
2485 {
2486     return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2487 }
2488 
2489 int32_t float32_to_int32(float32 a, float_status *s)
2490 {
2491     return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2492 }
2493 
2494 int64_t float32_to_int64(float32 a, float_status *s)
2495 {
2496     return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2497 }
2498 
2499 int16_t float64_to_int16(float64 a, float_status *s)
2500 {
2501     return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2502 }
2503 
2504 int32_t float64_to_int32(float64 a, float_status *s)
2505 {
2506     return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2507 }
2508 
2509 int64_t float64_to_int64(float64 a, float_status *s)
2510 {
2511     return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2512 }
2513 
2514 int16_t float16_to_int16_round_to_zero(float16 a, float_status *s)
2515 {
2516     return float16_to_int16_scalbn(a, float_round_to_zero, 0, s);
2517 }
2518 
2519 int32_t float16_to_int32_round_to_zero(float16 a, float_status *s)
2520 {
2521     return float16_to_int32_scalbn(a, float_round_to_zero, 0, s);
2522 }
2523 
2524 int64_t float16_to_int64_round_to_zero(float16 a, float_status *s)
2525 {
2526     return float16_to_int64_scalbn(a, float_round_to_zero, 0, s);
2527 }
2528 
2529 int16_t float32_to_int16_round_to_zero(float32 a, float_status *s)
2530 {
2531     return float32_to_int16_scalbn(a, float_round_to_zero, 0, s);
2532 }
2533 
2534 int32_t float32_to_int32_round_to_zero(float32 a, float_status *s)
2535 {
2536     return float32_to_int32_scalbn(a, float_round_to_zero, 0, s);
2537 }
2538 
2539 int64_t float32_to_int64_round_to_zero(float32 a, float_status *s)
2540 {
2541     return float32_to_int64_scalbn(a, float_round_to_zero, 0, s);
2542 }
2543 
2544 int16_t float64_to_int16_round_to_zero(float64 a, float_status *s)
2545 {
2546     return float64_to_int16_scalbn(a, float_round_to_zero, 0, s);
2547 }
2548 
2549 int32_t float64_to_int32_round_to_zero(float64 a, float_status *s)
2550 {
2551     return float64_to_int32_scalbn(a, float_round_to_zero, 0, s);
2552 }
2553 
2554 int64_t float64_to_int64_round_to_zero(float64 a, float_status *s)
2555 {
2556     return float64_to_int64_scalbn(a, float_round_to_zero, 0, s);
2557 }
2558 
2559 /*
2560  * Returns the result of converting the floating-point value `a' to
2561  * the two's complement integer format.
2562  */
2563 
2564 int16_t bfloat16_to_int16_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
2565                                  float_status *s)
2566 {
2567     FloatParts64 p;
2568 
2569     bfloat16_unpack_canonical(&p, a, s);
2570     return round_to_int_and_pack(p, rmode, scale, INT16_MIN, INT16_MAX, s);
2571 }
2572 
2573 int32_t bfloat16_to_int32_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
2574                                  float_status *s)
2575 {
2576     FloatParts64 p;
2577 
2578     bfloat16_unpack_canonical(&p, a, s);
2579     return round_to_int_and_pack(p, rmode, scale, INT32_MIN, INT32_MAX, s);
2580 }
2581 
2582 int64_t bfloat16_to_int64_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
2583                                  float_status *s)
2584 {
2585     FloatParts64 p;
2586 
2587     bfloat16_unpack_canonical(&p, a, s);
2588     return round_to_int_and_pack(p, rmode, scale, INT64_MIN, INT64_MAX, s);
2589 }
2590 
2591 int16_t bfloat16_to_int16(bfloat16 a, float_status *s)
2592 {
2593     return bfloat16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2594 }
2595 
2596 int32_t bfloat16_to_int32(bfloat16 a, float_status *s)
2597 {
2598     return bfloat16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2599 }
2600 
2601 int64_t bfloat16_to_int64(bfloat16 a, float_status *s)
2602 {
2603     return bfloat16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2604 }
2605 
2606 int16_t bfloat16_to_int16_round_to_zero(bfloat16 a, float_status *s)
2607 {
2608     return bfloat16_to_int16_scalbn(a, float_round_to_zero, 0, s);
2609 }
2610 
2611 int32_t bfloat16_to_int32_round_to_zero(bfloat16 a, float_status *s)
2612 {
2613     return bfloat16_to_int32_scalbn(a, float_round_to_zero, 0, s);
2614 }
2615 
2616 int64_t bfloat16_to_int64_round_to_zero(bfloat16 a, float_status *s)
2617 {
2618     return bfloat16_to_int64_scalbn(a, float_round_to_zero, 0, s);
2619 }
2620 
2621 /*
2622  *  Returns the result of converting the floating-point value `a' to
2623  *  the unsigned integer format. The conversion is performed according
2624  *  to the IEC/IEEE Standard for Binary Floating-Point
2625  *  Arithmetic---which means in particular that the conversion is
2626  *  rounded according to the current rounding mode. If `a' is a NaN,
2627  *  the largest unsigned integer is returned. Otherwise, if the
2628  *  conversion overflows, the largest unsigned integer is returned. If
2629  *  `a' is negative, the result is rounded and zero is returned;
2630  *  values that do not round to zero will raise the invalid exception
2631  *  flag.
2632  */
2633 
2634 static uint64_t round_to_uint_and_pack(FloatParts64 in, FloatRoundMode rmode,
2635                                        int scale, uint64_t max,
2636                                        float_status *s)
2637 {
2638     int orig_flags = get_float_exception_flags(s);
2639     FloatParts64 p = round_to_int(in, rmode, scale, s);
2640     uint64_t r;
2641 
2642     switch (p.cls) {
2643     case float_class_snan:
2644     case float_class_qnan:
2645         s->float_exception_flags = orig_flags | float_flag_invalid;
2646         return max;
2647     case float_class_inf:
2648         s->float_exception_flags = orig_flags | float_flag_invalid;
2649         return p.sign ? 0 : max;
2650     case float_class_zero:
2651         return 0;
2652     case float_class_normal:
2653         if (p.sign) {
2654             s->float_exception_flags = orig_flags | float_flag_invalid;
2655             return 0;
2656         }
2657 
2658         if (p.exp <= DECOMPOSED_BINARY_POINT) {
2659             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
2660         } else {
2661             s->float_exception_flags = orig_flags | float_flag_invalid;
2662             return max;
2663         }
2664 
2665         /* For uint64 this will never trip, but if p.exp is too large
2666          * to shift a decomposed fraction we will already have returned
2667          * max via the branch above.
2668          */
2669         if (r > max) {
2670             s->float_exception_flags = orig_flags | float_flag_invalid;
2671             return max;
2672         }
2673         return r;
2674     default:
2675         g_assert_not_reached();
2676     }
2677 }
2678 
2679 uint8_t float16_to_uint8_scalbn(float16 a, FloatRoundMode rmode, int scale,
2680                                 float_status *s)
2681 {
2682     FloatParts64 p;
2683 
2684     float16_unpack_canonical(&p, a, s);
2685     return round_to_uint_and_pack(p, rmode, scale, UINT8_MAX, s);
2686 }
2687 
2688 uint16_t float16_to_uint16_scalbn(float16 a, FloatRoundMode rmode, int scale,
2689                                   float_status *s)
2690 {
2691     FloatParts64 p;
2692 
2693     float16_unpack_canonical(&p, a, s);
2694     return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s);
2695 }
2696 
2697 uint32_t float16_to_uint32_scalbn(float16 a, FloatRoundMode rmode, int scale,
2698                                   float_status *s)
2699 {
2700     FloatParts64 p;
2701 
2702     float16_unpack_canonical(&p, a, s);
2703     return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s);
2704 }
2705 
2706 uint64_t float16_to_uint64_scalbn(float16 a, FloatRoundMode rmode, int scale,
2707                                   float_status *s)
2708 {
2709     FloatParts64 p;
2710 
2711     float16_unpack_canonical(&p, a, s);
2712     return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s);
2713 }
2714 
2715 uint16_t float32_to_uint16_scalbn(float32 a, FloatRoundMode rmode, int scale,
2716                                   float_status *s)
2717 {
2718     FloatParts64 p;
2719 
2720     float32_unpack_canonical(&p, a, s);
2721     return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s);
2722 }
2723 
2724 uint32_t float32_to_uint32_scalbn(float32 a, FloatRoundMode rmode, int scale,
2725                                   float_status *s)
2726 {
2727     FloatParts64 p;
2728 
2729     float32_unpack_canonical(&p, a, s);
2730     return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s);
2731 }
2732 
2733 uint64_t float32_to_uint64_scalbn(float32 a, FloatRoundMode rmode, int scale,
2734                                   float_status *s)
2735 {
2736     FloatParts64 p;
2737 
2738     float32_unpack_canonical(&p, a, s);
2739     return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s);
2740 }
2741 
2742 uint16_t float64_to_uint16_scalbn(float64 a, FloatRoundMode rmode, int scale,
2743                                   float_status *s)
2744 {
2745     FloatParts64 p;
2746 
2747     float64_unpack_canonical(&p, a, s);
2748     return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s);
2749 }
2750 
2751 uint32_t float64_to_uint32_scalbn(float64 a, FloatRoundMode rmode, int scale,
2752                                   float_status *s)
2753 {
2754     FloatParts64 p;
2755 
2756     float64_unpack_canonical(&p, a, s);
2757     return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s);
2758 }
2759 
2760 uint64_t float64_to_uint64_scalbn(float64 a, FloatRoundMode rmode, int scale,
2761                                   float_status *s)
2762 {
2763     FloatParts64 p;
2764 
2765     float64_unpack_canonical(&p, a, s);
2766     return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s);
2767 }
2768 
2769 uint8_t float16_to_uint8(float16 a, float_status *s)
2770 {
2771     return float16_to_uint8_scalbn(a, s->float_rounding_mode, 0, s);
2772 }
2773 
2774 uint16_t float16_to_uint16(float16 a, float_status *s)
2775 {
2776     return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2777 }
2778 
2779 uint32_t float16_to_uint32(float16 a, float_status *s)
2780 {
2781     return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2782 }
2783 
2784 uint64_t float16_to_uint64(float16 a, float_status *s)
2785 {
2786     return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2787 }
2788 
2789 uint16_t float32_to_uint16(float32 a, float_status *s)
2790 {
2791     return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2792 }
2793 
2794 uint32_t float32_to_uint32(float32 a, float_status *s)
2795 {
2796     return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2797 }
2798 
2799 uint64_t float32_to_uint64(float32 a, float_status *s)
2800 {
2801     return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2802 }
2803 
2804 uint16_t float64_to_uint16(float64 a, float_status *s)
2805 {
2806     return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2807 }
2808 
2809 uint32_t float64_to_uint32(float64 a, float_status *s)
2810 {
2811     return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2812 }
2813 
2814 uint64_t float64_to_uint64(float64 a, float_status *s)
2815 {
2816     return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2817 }
2818 
2819 uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s)
2820 {
2821     return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2822 }
2823 
2824 uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s)
2825 {
2826     return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2827 }
2828 
2829 uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s)
2830 {
2831     return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2832 }
2833 
2834 uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *s)
2835 {
2836     return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2837 }
2838 
2839 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s)
2840 {
2841     return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2842 }
2843 
2844 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s)
2845 {
2846     return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2847 }
2848 
2849 uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s)
2850 {
2851     return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2852 }
2853 
2854 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s)
2855 {
2856     return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2857 }
2858 
2859 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s)
2860 {
2861     return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2862 }
2863 
2864 /*
2865  *  Returns the result of converting the bfloat16 value `a' to
2866  *  the unsigned integer format.
2867  */
2868 
2869 uint16_t bfloat16_to_uint16_scalbn(bfloat16 a, FloatRoundMode rmode,
2870                                    int scale, float_status *s)
2871 {
2872     FloatParts64 p;
2873 
2874     bfloat16_unpack_canonical(&p, a, s);
2875     return round_to_uint_and_pack(p, rmode, scale, UINT16_MAX, s);
2876 }
2877 
2878 uint32_t bfloat16_to_uint32_scalbn(bfloat16 a, FloatRoundMode rmode,
2879                                    int scale, float_status *s)
2880 {
2881     FloatParts64 p;
2882 
2883     bfloat16_unpack_canonical(&p, a, s);
2884     return round_to_uint_and_pack(p, rmode, scale, UINT32_MAX, s);
2885 }
2886 
2887 uint64_t bfloat16_to_uint64_scalbn(bfloat16 a, FloatRoundMode rmode,
2888                                    int scale, float_status *s)
2889 {
2890     FloatParts64 p;
2891 
2892     bfloat16_unpack_canonical(&p, a, s);
2893     return round_to_uint_and_pack(p, rmode, scale, UINT64_MAX, s);
2894 }
2895 
2896 uint16_t bfloat16_to_uint16(bfloat16 a, float_status *s)
2897 {
2898     return bfloat16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2899 }
2900 
2901 uint32_t bfloat16_to_uint32(bfloat16 a, float_status *s)
2902 {
2903     return bfloat16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2904 }
2905 
2906 uint64_t bfloat16_to_uint64(bfloat16 a, float_status *s)
2907 {
2908     return bfloat16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2909 }
2910 
2911 uint16_t bfloat16_to_uint16_round_to_zero(bfloat16 a, float_status *s)
2912 {
2913     return bfloat16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2914 }
2915 
2916 uint32_t bfloat16_to_uint32_round_to_zero(bfloat16 a, float_status *s)
2917 {
2918     return bfloat16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2919 }
2920 
2921 uint64_t bfloat16_to_uint64_round_to_zero(bfloat16 a, float_status *s)
2922 {
2923     return bfloat16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2924 }
2925 
2926 /*
2927  * Integer to float conversions
2928  *
2929  * Returns the result of converting the two's complement integer `a'
2930  * to the floating-point format. The conversion is performed according
2931  * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2932  */
2933 
2934 static FloatParts64 int_to_float(int64_t a, int scale, float_status *status)
2935 {
2936     FloatParts64 r = { .sign = false };
2937 
2938     if (a == 0) {
2939         r.cls = float_class_zero;
2940     } else {
2941         uint64_t f = a;
2942         int shift;
2943 
2944         r.cls = float_class_normal;
2945         if (a < 0) {
2946             f = -f;
2947             r.sign = true;
2948         }
2949         shift = clz64(f);
2950         scale = MIN(MAX(scale, -0x10000), 0x10000);
2951 
2952         r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
2953         r.frac = f << shift;
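
        /*
         * Example: a = -6 gives sign = true and f = 6; shift = clz64(6) = 61,
         * so the two top bits of frac end up set and, together with exp,
         * decode back to a magnitude of 6.
         */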
2954     }
2955 
2956     return r;
2957 }
2958 
2959 float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status)
2960 {
2961     FloatParts64 pa = int_to_float(a, scale, status);
2962     return float16_round_pack_canonical(&pa, status);
2963 }
2964 
2965 float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status)
2966 {
2967     return int64_to_float16_scalbn(a, scale, status);
2968 }
2969 
2970 float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status)
2971 {
2972     return int64_to_float16_scalbn(a, scale, status);
2973 }
2974 
2975 float16 int64_to_float16(int64_t a, float_status *status)
2976 {
2977     return int64_to_float16_scalbn(a, 0, status);
2978 }
2979 
2980 float16 int32_to_float16(int32_t a, float_status *status)
2981 {
2982     return int64_to_float16_scalbn(a, 0, status);
2983 }
2984 
2985 float16 int16_to_float16(int16_t a, float_status *status)
2986 {
2987     return int64_to_float16_scalbn(a, 0, status);
2988 }
2989 
2990 float16 int8_to_float16(int8_t a, float_status *status)
2991 {
2992     return int64_to_float16_scalbn(a, 0, status);
2993 }
2994 
2995 float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status)
2996 {
2997     FloatParts64 pa = int_to_float(a, scale, status);
2998     return float32_round_pack_canonical(&pa, status);
2999 }
3000 
3001 float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status)
3002 {
3003     return int64_to_float32_scalbn(a, scale, status);
3004 }
3005 
3006 float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status)
3007 {
3008     return int64_to_float32_scalbn(a, scale, status);
3009 }
3010 
3011 float32 int64_to_float32(int64_t a, float_status *status)
3012 {
3013     return int64_to_float32_scalbn(a, 0, status);
3014 }
3015 
3016 float32 int32_to_float32(int32_t a, float_status *status)
3017 {
3018     return int64_to_float32_scalbn(a, 0, status);
3019 }
3020 
3021 float32 int16_to_float32(int16_t a, float_status *status)
3022 {
3023     return int64_to_float32_scalbn(a, 0, status);
3024 }
3025 
3026 float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status)
3027 {
3028     FloatParts64 pa = int_to_float(a, scale, status);
3029     return float64_round_pack_canonical(&pa, status);
3030 }
3031 
3032 float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status)
3033 {
3034     return int64_to_float64_scalbn(a, scale, status);
3035 }
3036 
3037 float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status)
3038 {
3039     return int64_to_float64_scalbn(a, scale, status);
3040 }
3041 
3042 float64 int64_to_float64(int64_t a, float_status *status)
3043 {
3044     return int64_to_float64_scalbn(a, 0, status);
3045 }
3046 
3047 float64 int32_to_float64(int32_t a, float_status *status)
3048 {
3049     return int64_to_float64_scalbn(a, 0, status);
3050 }
3051 
3052 float64 int16_to_float64(int16_t a, float_status *status)
3053 {
3054     return int64_to_float64_scalbn(a, 0, status);
3055 }
3056 
3057 /*
3058  * Returns the result of converting the two's complement integer `a'
3059  * to the bfloat16 format.
3060  */
3061 
3062 bfloat16 int64_to_bfloat16_scalbn(int64_t a, int scale, float_status *status)
3063 {
3064     FloatParts64 pa = int_to_float(a, scale, status);
3065     return bfloat16_round_pack_canonical(&pa, status);
3066 }
3067 
3068 bfloat16 int32_to_bfloat16_scalbn(int32_t a, int scale, float_status *status)
3069 {
3070     return int64_to_bfloat16_scalbn(a, scale, status);
3071 }
3072 
3073 bfloat16 int16_to_bfloat16_scalbn(int16_t a, int scale, float_status *status)
3074 {
3075     return int64_to_bfloat16_scalbn(a, scale, status);
3076 }
3077 
3078 bfloat16 int64_to_bfloat16(int64_t a, float_status *status)
3079 {
3080     return int64_to_bfloat16_scalbn(a, 0, status);
3081 }
3082 
3083 bfloat16 int32_to_bfloat16(int32_t a, float_status *status)
3084 {
3085     return int64_to_bfloat16_scalbn(a, 0, status);
3086 }
3087 
3088 bfloat16 int16_to_bfloat16(int16_t a, float_status *status)
3089 {
3090     return int64_to_bfloat16_scalbn(a, 0, status);
3091 }
3092 
3093 /*
3094  * Unsigned Integer to float conversions
3095  *
3096  * Returns the result of converting the unsigned integer `a' to the
3097  * floating-point format. The conversion is performed according to the
3098  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3099  */
3100 
3101 static FloatParts64 uint_to_float(uint64_t a, int scale, float_status *status)
3102 {
3103     FloatParts64 r = { .sign = false };
3104     int shift;
3105 
3106     if (a == 0) {
3107         r.cls = float_class_zero;
3108     } else {
3109         scale = MIN(MAX(scale, -0x10000), 0x10000);
3110         shift = clz64(a);
3111         r.cls = float_class_normal;
3112         r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
3113         r.frac = a << shift;
3114     }
3115 
3116     return r;
3117 }
3118 
3119 float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status)
3120 {
3121     FloatParts64 pa = uint_to_float(a, scale, status);
3122     return float16_round_pack_canonical(&pa, status);
3123 }
3124 
3125 float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status)
3126 {
3127     return uint64_to_float16_scalbn(a, scale, status);
3128 }
3129 
3130 float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status)
3131 {
3132     return uint64_to_float16_scalbn(a, scale, status);
3133 }
3134 
3135 float16 uint64_to_float16(uint64_t a, float_status *status)
3136 {
3137     return uint64_to_float16_scalbn(a, 0, status);
3138 }
3139 
3140 float16 uint32_to_float16(uint32_t a, float_status *status)
3141 {
3142     return uint64_to_float16_scalbn(a, 0, status);
3143 }
3144 
3145 float16 uint16_to_float16(uint16_t a, float_status *status)
3146 {
3147     return uint64_to_float16_scalbn(a, 0, status);
3148 }
3149 
3150 float16 uint8_to_float16(uint8_t a, float_status *status)
3151 {
3152     return uint64_to_float16_scalbn(a, 0, status);
3153 }
3154 
3155 float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status)
3156 {
3157     FloatParts64 pa = uint_to_float(a, scale, status);
3158     return float32_round_pack_canonical(&pa, status);
3159 }
3160 
3161 float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status)
3162 {
3163     return uint64_to_float32_scalbn(a, scale, status);
3164 }
3165 
3166 float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status)
3167 {
3168     return uint64_to_float32_scalbn(a, scale, status);
3169 }
3170 
3171 float32 uint64_to_float32(uint64_t a, float_status *status)
3172 {
3173     return uint64_to_float32_scalbn(a, 0, status);
3174 }
3175 
3176 float32 uint32_to_float32(uint32_t a, float_status *status)
3177 {
3178     return uint64_to_float32_scalbn(a, 0, status);
3179 }
3180 
3181 float32 uint16_to_float32(uint16_t a, float_status *status)
3182 {
3183     return uint64_to_float32_scalbn(a, 0, status);
3184 }
3185 
3186 float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status)
3187 {
3188     FloatParts64 pa = uint_to_float(a, scale, status);
3189     return float64_round_pack_canonical(&pa, status);
3190 }
3191 
3192 float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status)
3193 {
3194     return uint64_to_float64_scalbn(a, scale, status);
3195 }
3196 
3197 float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status)
3198 {
3199     return uint64_to_float64_scalbn(a, scale, status);
3200 }
3201 
3202 float64 uint64_to_float64(uint64_t a, float_status *status)
3203 {
3204     return uint64_to_float64_scalbn(a, 0, status);
3205 }
3206 
3207 float64 uint32_to_float64(uint32_t a, float_status *status)
3208 {
3209     return uint64_to_float64_scalbn(a, 0, status);
3210 }
3211 
3212 float64 uint16_to_float64(uint16_t a, float_status *status)
3213 {
3214     return uint64_to_float64_scalbn(a, 0, status);
3215 }
3216 
3217 /*
3218  * Returns the result of converting the unsigned integer `a' to the
3219  * bfloat16 format.
3220  */
3221 
3222 bfloat16 uint64_to_bfloat16_scalbn(uint64_t a, int scale, float_status *status)
3223 {
3224     FloatParts64 pa = uint_to_float(a, scale, status);
3225     return bfloat16_round_pack_canonical(&pa, status);
3226 }
3227 
3228 bfloat16 uint32_to_bfloat16_scalbn(uint32_t a, int scale, float_status *status)
3229 {
3230     return uint64_to_bfloat16_scalbn(a, scale, status);
3231 }
3232 
3233 bfloat16 uint16_to_bfloat16_scalbn(uint16_t a, int scale, float_status *status)
3234 {
3235     return uint64_to_bfloat16_scalbn(a, scale, status);
3236 }
3237 
3238 bfloat16 uint64_to_bfloat16(uint64_t a, float_status *status)
3239 {
3240     return uint64_to_bfloat16_scalbn(a, 0, status);
3241 }
3242 
3243 bfloat16 uint32_to_bfloat16(uint32_t a, float_status *status)
3244 {
3245     return uint64_to_bfloat16_scalbn(a, 0, status);
3246 }
3247 
3248 bfloat16 uint16_to_bfloat16(uint16_t a, float_status *status)
3249 {
3250     return uint64_to_bfloat16_scalbn(a, 0, status);
3251 }
3252 
3253 /* Float Min/Max */
3254 /* min() and max() functions. These can't be implemented as
3255  * 'compare and pick one input' because that would mishandle
3256  * NaNs and +0 vs -0.
3257  *
3258  * minnum() and maxnum() functions. These are similar to the min()
3259  * and max() functions but if one of the arguments is a QNaN and
3260  * the other is numerical then the numerical argument is returned.
3261  * SNaNs will get quietened before being returned.
3262  * minnum() and maxnum() correspond to the IEEE 754-2008 minNum()
3263  * and maxNum() operations. min() and max() are the typical min/max
3264  * semantics provided by many CPUs which predate that specification.
3265  *
3266  * minnummag() and maxnummag() functions correspond to minNumMag()
3267  * and maxNumMag() from IEEE 754-2008.
3268  */
3269 static FloatParts64 minmax_floats(FloatParts64 a, FloatParts64 b, bool ismin,
3270                                 bool ieee, bool ismag, float_status *s)
3271 {
3272     if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
3273         if (ieee) {
3274             /* Takes two floating-point values `a' and `b', one of
3275              * which is a NaN, and returns the appropriate NaN
3276              * result. If either `a' or `b' is a signaling NaN,
3277              * the invalid exception is raised.
3278              */
3279             if (is_snan(a.cls) || is_snan(b.cls)) {
3280                 return *parts_pick_nan(&a, &b, s);
3281             } else if (is_nan(a.cls) && !is_nan(b.cls)) {
3282                 return b;
3283             } else if (is_nan(b.cls) && !is_nan(a.cls)) {
3284                 return a;
3285             }
3286         }
3287         return *parts_pick_nan(&a, &b, s);
3288     } else {
3289         int a_exp, b_exp;
3290 
3291         switch (a.cls) {
3292         case float_class_normal:
3293             a_exp = a.exp;
3294             break;
3295         case float_class_inf:
3296             a_exp = INT_MAX;
3297             break;
3298         case float_class_zero:
3299             a_exp = INT_MIN;
3300             break;
3301         default:
3302             g_assert_not_reached();
3303             break;
3304         }
3305         switch (b.cls) {
3306         case float_class_normal:
3307             b_exp = b.exp;
3308             break;
3309         case float_class_inf:
3310             b_exp = INT_MAX;
3311             break;
3312         case float_class_zero:
3313             b_exp = INT_MIN;
3314             break;
3315         default:
3316             g_assert_not_reached();
3317             break;
3318         }
3319 
3320         if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
3321             bool a_less = a_exp < b_exp;
3322             if (a_exp == b_exp) {
3323                 a_less = a.frac < b.frac;
3324             }
3325             return a_less ^ ismin ? b : a;
3326         }
3327 
3328         if (a.sign == b.sign) {
3329             bool a_less = a_exp < b_exp;
3330             if (a_exp == b_exp) {
3331                 a_less = a.frac < b.frac;
3332             }
3333             return a.sign ^ a_less ^ ismin ? b : a;
3334         } else {
3335             return a.sign ^ ismin ? b : a;
3336         }
3337     }
3338 }
3339 
3340 #define MINMAX(sz, name, ismin, isieee, ismag)                          \
3341 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b,      \
3342                                      float_status *s)                   \
3343 {                                                                       \
3344     FloatParts64 pa, pb, pr;                                            \
3345     float ## sz ## _unpack_canonical(&pa, a, s);                        \
3346     float ## sz ## _unpack_canonical(&pb, b, s);                        \
3347     pr = minmax_floats(pa, pb, ismin, isieee, ismag, s);                \
3348     return float ## sz ## _round_pack_canonical(&pr, s);                \
3349 }
3350 
3351 MINMAX(16, min, true, false, false)
3352 MINMAX(16, minnum, true, true, false)
3353 MINMAX(16, minnummag, true, true, true)
3354 MINMAX(16, max, false, false, false)
3355 MINMAX(16, maxnum, false, true, false)
3356 MINMAX(16, maxnummag, false, true, true)
3357 
3358 MINMAX(32, min, true, false, false)
3359 MINMAX(32, minnum, true, true, false)
3360 MINMAX(32, minnummag, true, true, true)
3361 MINMAX(32, max, false, false, false)
3362 MINMAX(32, maxnum, false, true, false)
3363 MINMAX(32, maxnummag, false, true, true)
3364 
3365 MINMAX(64, min, true, false, false)
3366 MINMAX(64, minnum, true, true, false)
3367 MINMAX(64, minnummag, true, true, true)
3368 MINMAX(64, max, false, false, false)
3369 MINMAX(64, maxnum, false, true, false)
3370 MINMAX(64, maxnummag, false, true, true)
3371 
3372 #undef MINMAX
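
/*
 * Illustrative NaN handling, assuming the usual softfloat helpers
 * make_float32() and float32_default_nan():
 *
 *     float32 one  = make_float32(0x3f800000);   (1.0f)
 *     float32 qnan = float32_default_nan(s);
 *
 *     float32_min(qnan, one, s)     -> a NaN  (min/max propagate NaNs)
 *     float32_minnum(qnan, one, s)  -> 1.0f   (minNum prefers the number)
 *
 * With a signaling NaN operand, both variants raise float_flag_invalid
 * and return a NaN.
 */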
3373 
3374 #define BF16_MINMAX(name, ismin, isieee, ismag)                         \
3375 bfloat16 bfloat16_ ## name(bfloat16 a, bfloat16 b, float_status *s)     \
3376 {                                                                       \
3377     FloatParts64 pa, pb, pr;                                            \
3378     bfloat16_unpack_canonical(&pa, a, s);                               \
3379     bfloat16_unpack_canonical(&pb, b, s);                               \
3380     pr = minmax_floats(pa, pb, ismin, isieee, ismag, s);                \
3381     return bfloat16_round_pack_canonical(&pr, s);                       \
3382 }
3383 
3384 BF16_MINMAX(min, true, false, false)
3385 BF16_MINMAX(minnum, true, true, false)
3386 BF16_MINMAX(minnummag, true, true, true)
3387 BF16_MINMAX(max, false, false, false)
3388 BF16_MINMAX(maxnum, false, true, false)
3389 BF16_MINMAX(maxnummag, false, true, true)
3390 
3391 #undef BF16_MINMAX
3392 
3393 /* Floating point compare */
3394 static FloatRelation compare_floats(FloatParts64 a, FloatParts64 b, bool is_quiet,
3395                                     float_status *s)
3396 {
3397     if (is_nan(a.cls) || is_nan(b.cls)) {
3398         if (!is_quiet ||
3399             a.cls == float_class_snan ||
3400             b.cls == float_class_snan) {
3401             float_raise(float_flag_invalid, s);
3402         }
3403         return float_relation_unordered;
3404     }
3405 
3406     if (a.cls == float_class_zero) {
3407         if (b.cls == float_class_zero) {
3408             return float_relation_equal;
3409         }
3410         return b.sign ? float_relation_greater : float_relation_less;
3411     } else if (b.cls == float_class_zero) {
3412         return a.sign ? float_relation_less : float_relation_greater;
3413     }
3414 
3415     /* The only really important thing about infinity is its sign. If
3416      * both are infinities the negative one is the smaller of the two.
3417      */
3418     if (a.cls == float_class_inf) {
3419         if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
3420             return float_relation_equal;
3421         }
3422         return a.sign ? float_relation_less : float_relation_greater;
3423     } else if (b.cls == float_class_inf) {
3424         return b.sign ? float_relation_greater : float_relation_less;
3425     }
3426 
3427     if (a.sign != b.sign) {
3428         return a.sign ? float_relation_less : float_relation_greater;
3429     }
3430 
3431     if (a.exp == b.exp) {
3432         if (a.frac == b.frac) {
3433             return float_relation_equal;
3434         }
3435         if (a.sign) {
3436             return a.frac > b.frac ?
3437                 float_relation_less : float_relation_greater;
3438         } else {
3439             return a.frac > b.frac ?
3440                 float_relation_greater : float_relation_less;
3441         }
3442     } else {
3443         if (a.sign) {
3444             return a.exp > b.exp ? float_relation_less : float_relation_greater;
3445         } else {
3446             return a.exp > b.exp ? float_relation_greater : float_relation_less;
3447         }
3448     }
3449 }
3450 
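/*
 * Illustrative behaviour: any NaN operand makes the relation unordered.
 * The signaling comparisons raise float_flag_invalid for any NaN operand,
 * the *_quiet variants only for a signaling NaN:
 *
 *     float32_compare(float32_default_nan(s), float32_zero, s)
 *         -> float_relation_unordered, float_flag_invalid raised
 *     float32_compare_quiet(float32_default_nan(s), float32_zero, s)
 *         -> float_relation_unordered, no exception raised
 */
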
3451 #define COMPARE(name, attr, sz)                                         \
3452 static int attr                                                         \
3453 name(float ## sz a, float ## sz b, bool is_quiet, float_status *s)      \
3454 {                                                                       \
3455     FloatParts64 pa, pb;                                                \
3456     float ## sz ## _unpack_canonical(&pa, a, s);                        \
3457     float ## sz ## _unpack_canonical(&pb, b, s);                        \
3458     return compare_floats(pa, pb, is_quiet, s);                         \
3459 }
3460 
3461 COMPARE(soft_f16_compare, QEMU_FLATTEN, 16)
3462 COMPARE(soft_f32_compare, QEMU_SOFTFLOAT_ATTR, 32)
3463 COMPARE(soft_f64_compare, QEMU_SOFTFLOAT_ATTR, 64)
3464 
3465 #undef COMPARE
3466 
3467 FloatRelation float16_compare(float16 a, float16 b, float_status *s)
3468 {
3469     return soft_f16_compare(a, b, false, s);
3470 }
3471 
3472 FloatRelation float16_compare_quiet(float16 a, float16 b, float_status *s)
3473 {
3474     return soft_f16_compare(a, b, true, s);
3475 }
3476 
3477 static FloatRelation QEMU_FLATTEN
3478 f32_compare(float32 xa, float32 xb, bool is_quiet, float_status *s)
3479 {
3480     union_float32 ua, ub;
3481 
3482     ua.s = xa;
3483     ub.s = xb;
3484 
3485     if (QEMU_NO_HARDFLOAT) {
3486         goto soft;
3487     }
3488 
3489     float32_input_flush2(&ua.s, &ub.s, s);
3490     if (isgreaterequal(ua.h, ub.h)) {
3491         if (isgreater(ua.h, ub.h)) {
3492             return float_relation_greater;
3493         }
3494         return float_relation_equal;
3495     }
3496     if (likely(isless(ua.h, ub.h))) {
3497         return float_relation_less;
3498     }
3499     /* The only condition remaining is unordered.
3500      * Fall through to set flags.
3501      */
3502  soft:
3503     return soft_f32_compare(ua.s, ub.s, is_quiet, s);
3504 }
3505 
3506 FloatRelation float32_compare(float32 a, float32 b, float_status *s)
3507 {
3508     return f32_compare(a, b, false, s);
3509 }
3510 
3511 FloatRelation float32_compare_quiet(float32 a, float32 b, float_status *s)
3512 {
3513     return f32_compare(a, b, true, s);
3514 }
3515 
3516 static FloatRelation QEMU_FLATTEN
3517 f64_compare(float64 xa, float64 xb, bool is_quiet, float_status *s)
3518 {
3519     union_float64 ua, ub;
3520 
3521     ua.s = xa;
3522     ub.s = xb;
3523 
3524     if (QEMU_NO_HARDFLOAT) {
3525         goto soft;
3526     }
3527 
3528     float64_input_flush2(&ua.s, &ub.s, s);
3529     if (isgreaterequal(ua.h, ub.h)) {
3530         if (isgreater(ua.h, ub.h)) {
3531             return float_relation_greater;
3532         }
3533         return float_relation_equal;
3534     }
3535     if (likely(isless(ua.h, ub.h))) {
3536         return float_relation_less;
3537     }
3538     /* The only condition remaining is unordered.
3539      * Fall through to set flags.
3540      */
3541  soft:
3542     return soft_f64_compare(ua.s, ub.s, is_quiet, s);
3543 }
3544 
3545 FloatRelation float64_compare(float64 a, float64 b, float_status *s)
3546 {
3547     return f64_compare(a, b, false, s);
3548 }
3549 
3550 FloatRelation float64_compare_quiet(float64 a, float64 b, float_status *s)
3551 {
3552     return f64_compare(a, b, true, s);
3553 }
3554 
3555 static FloatRelation QEMU_FLATTEN
3556 soft_bf16_compare(bfloat16 a, bfloat16 b, bool is_quiet, float_status *s)
3557 {
3558     FloatParts64 pa, pb;
3559 
3560     bfloat16_unpack_canonical(&pa, a, s);
3561     bfloat16_unpack_canonical(&pb, b, s);
3562     return compare_floats(pa, pb, is_quiet, s);
3563 }
3564 
3565 FloatRelation bfloat16_compare(bfloat16 a, bfloat16 b, float_status *s)
3566 {
3567     return soft_bf16_compare(a, b, false, s);
3568 }
3569 
3570 FloatRelation bfloat16_compare_quiet(bfloat16 a, bfloat16 b, float_status *s)
3571 {
3572     return soft_bf16_compare(a, b, true, s);
3573 }
3574 
3575 /* Multiply A by 2 raised to the power N.  */
3576 static FloatParts64 scalbn_decomposed(FloatParts64 a, int n, float_status *s)
3577 {
3578     if (unlikely(is_nan(a.cls))) {
3579         parts_return_nan(&a, s);
3580     }
3581     if (a.cls == float_class_normal) {
3582         /* The largest float type (even though not supported by FloatParts64)
3583          * is float128, which has a 15-bit exponent.  Bounding N to 16 bits
3584          * still allows rounding to infinity, without allowing overflow
3585          * within the int32_t that backs FloatParts64.exp.
3586          */
3587         n = MIN(MAX(n, -0x10000), 0x10000);
3588         a.exp += n;
3589     }
3590     return a;
3591 }
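
/*
 * For example: scalbn scales by 2**N, so
 * float32_scalbn(make_float32(0x3f800000), 3, s) yields 8.0f, and a
 * negative N divides by 2**(-N); results pushed outside the format's
 * range overflow or underflow as usual when repacked.
 */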
3592 
3593 float16 float16_scalbn(float16 a, int n, float_status *status)
3594 {
3595     FloatParts64 pa, pr;
3596 
3597     float16_unpack_canonical(&pa, a, status);
3598     pr = scalbn_decomposed(pa, n, status);
3599     return float16_round_pack_canonical(&pr, status);
3600 }
3601 
3602 float32 float32_scalbn(float32 a, int n, float_status *status)
3603 {
3604     FloatParts64 pa, pr;
3605 
3606     float32_unpack_canonical(&pa, a, status);
3607     pr = scalbn_decomposed(pa, n, status);
3608     return float32_round_pack_canonical(&pr, status);
3609 }
3610 
3611 float64 float64_scalbn(float64 a, int n, float_status *status)
3612 {
3613     FloatParts64 pa, pr;
3614 
3615     float64_unpack_canonical(&pa, a, status);
3616     pr = scalbn_decomposed(pa, n, status);
3617     return float64_round_pack_canonical(&pr, status);
3618 }
3619 
3620 bfloat16 bfloat16_scalbn(bfloat16 a, int n, float_status *status)
3621 {
3622     FloatParts64 pa, pr;
3623 
3624     bfloat16_unpack_canonical(&pa, a, status);
3625     pr = scalbn_decomposed(pa, n, status);
3626     return bfloat16_round_pack_canonical(&pr, status);
3627 }
3628 
3629 /*
3630  * Square Root
3631  *
3632  * The old softfloat code did an approximation step before zeroing in
3633  * on the final result. However, for simplicity, we just compute the
3634  * square root by iterating down from the implicit bit to enough extra
3635  * bits to ensure we get a correctly rounded result.
3636  *
3637  * This does mean, however, that the calculation is slower than before,
3638  * especially for 64-bit floats.
3639  */
3640 
3641 static FloatParts64 sqrt_float(FloatParts64 a, float_status *s, const FloatFmt *p)
3642 {
3643     uint64_t a_frac, r_frac, s_frac;
3644     int bit, last_bit;
3645 
3646     if (is_nan(a.cls)) {
3647         parts_return_nan(&a, s);
3648         return a;
3649     }
3650     if (a.cls == float_class_zero) {
3651         return a;  /* sqrt(+-0) = +-0 */
3652     }
3653     if (a.sign) {
3654         float_raise(float_flag_invalid, s);
3655         parts_default_nan(&a, s);
3656         return a;
3657     }
3658     if (a.cls == float_class_inf) {
3659         return a;  /* sqrt(+inf) = +inf */
3660     }
3661 
3662     assert(a.cls == float_class_normal);
3663 
3664     /* We need two overflow bits at the top. Adding room for that is a
3665      * right shift. If the exponent is odd, we can discard the low bit
3666      * by multiplying the fraction by 2; that's a left shift. Combine
3667      * those and we shift right by 1 if the exponent is odd, otherwise 2.
3668      */
3669     a_frac = a.frac >> (2 - (a.exp & 1));
3670     a.exp >>= 1;
3671 
3672     /* Bit-by-bit computation of sqrt.  */
3673     r_frac = 0;
3674     s_frac = 0;
3675 
3676     /* Iterate from implicit bit down to the 3 extra bits to compute a
3677      * properly rounded result. Remember we've inserted two more bits
3678      * at the top, so these positions are two less.
3679      */
3680     bit = DECOMPOSED_BINARY_POINT - 2;
3681     last_bit = MAX(p->frac_shift - 4, 0);
3682     do {
3683         uint64_t q = 1ULL << bit;
3684         uint64_t t_frac = s_frac + q;
3685         if (t_frac <= a_frac) {
3686             s_frac = t_frac + q;
3687             a_frac -= t_frac;
3688             r_frac += q;
3689         }
3690         a_frac <<= 1;
3691     } while (--bit >= last_bit);
3692 
3693     /* Undo the right shift done above. If there is any remaining
3694      * fraction, the result is inexact. Set the sticky bit.
3695      */
3696     a.frac = (r_frac << 2) + (a_frac != 0);
3697 
3698     return a;
3699 }
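
/*
 * Illustrative special cases, following the checks at the top of
 * sqrt_float():
 *
 *     float32_sqrt(float32_set_sign(float32_zero, 1), s)  -> -0.0f
 *     float32_sqrt(make_float32(0xbf800000), s)           -> default NaN,
 *                                             float_flag_invalid raised
 *
 * where 0xbf800000 is -1.0f.  Positive infinity is returned unchanged.
 */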
3700 
3701 float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status)
3702 {
3703     FloatParts64 pa, pr;
3704 
3705     float16_unpack_canonical(&pa, a, status);
3706     pr = sqrt_float(pa, status, &float16_params);
3707     return float16_round_pack_canonical(&pr, status);
3708 }
3709 
3710 static float32 QEMU_SOFTFLOAT_ATTR
3711 soft_f32_sqrt(float32 a, float_status *status)
3712 {
3713     FloatParts64 pa, pr;
3714 
3715     float32_unpack_canonical(&pa, a, status);
3716     pr = sqrt_float(pa, status, &float32_params);
3717     return float32_round_pack_canonical(&pr, status);
3718 }
3719 
3720 static float64 QEMU_SOFTFLOAT_ATTR
3721 soft_f64_sqrt(float64 a, float_status *status)
3722 {
3723     FloatParts64 pa, pr;
3724 
3725     float64_unpack_canonical(&pa, a, status);
3726     pr = sqrt_float(pa, status, &float64_params);
3727     return float64_round_pack_canonical(&pr, status);
3728 }
3729 
3730 float32 QEMU_FLATTEN float32_sqrt(float32 xa, float_status *s)
3731 {
3732     union_float32 ua, ur;
3733 
3734     ua.s = xa;
3735     if (unlikely(!can_use_fpu(s))) {
3736         goto soft;
3737     }
3738 
3739     float32_input_flush1(&ua.s, s);
3740     if (QEMU_HARDFLOAT_1F32_USE_FP) {
3741         if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
3742                        fpclassify(ua.h) == FP_ZERO) ||
3743                      signbit(ua.h))) {
3744             goto soft;
3745         }
3746     } else if (unlikely(!float32_is_zero_or_normal(ua.s) ||
3747                         float32_is_neg(ua.s))) {
3748         goto soft;
3749     }
3750     ur.h = sqrtf(ua.h);
3751     return ur.s;
3752 
3753  soft:
3754     return soft_f32_sqrt(ua.s, s);
3755 }
3756 
3757 float64 QEMU_FLATTEN float64_sqrt(float64 xa, float_status *s)
3758 {
3759     union_float64 ua, ur;
3760 
3761     ua.s = xa;
3762     if (unlikely(!can_use_fpu(s))) {
3763         goto soft;
3764     }
3765 
3766     float64_input_flush1(&ua.s, s);
3767     if (QEMU_HARDFLOAT_1F64_USE_FP) {
3768         if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
3769                        fpclassify(ua.h) == FP_ZERO) ||
3770                      signbit(ua.h))) {
3771             goto soft;
3772         }
3773     } else if (unlikely(!float64_is_zero_or_normal(ua.s) ||
3774                         float64_is_neg(ua.s))) {
3775         goto soft;
3776     }
3777     ur.h = sqrt(ua.h);
3778     return ur.s;
3779 
3780  soft:
3781     return soft_f64_sqrt(ua.s, s);
3782 }
3783 
3784 bfloat16 QEMU_FLATTEN bfloat16_sqrt(bfloat16 a, float_status *status)
3785 {
3786     FloatParts64 pa, pr;
3787 
3788     bfloat16_unpack_canonical(&pa, a, status);
3789     pr = sqrt_float(pa, status, &bfloat16_params);
3790     return bfloat16_round_pack_canonical(&pr, status);
3791 }
3792 
3793 /*----------------------------------------------------------------------------
3794 | The pattern for a default generated NaN.
3795 *----------------------------------------------------------------------------*/
3796 
3797 float16 float16_default_nan(float_status *status)
3798 {
3799     FloatParts64 p;
3800 
3801     parts_default_nan(&p, status);
3802     p.frac >>= float16_params.frac_shift;
3803     return float16_pack_raw(&p);
3804 }
3805 
3806 float32 float32_default_nan(float_status *status)
3807 {
3808     FloatParts64 p;
3809 
3810     parts_default_nan(&p, status);
3811     p.frac >>= float32_params.frac_shift;
3812     return float32_pack_raw(&p);
3813 }
3814 
3815 float64 float64_default_nan(float_status *status)
3816 {
3817     FloatParts64 p;
3818 
3819     parts_default_nan(&p, status);
3820     p.frac >>= float64_params.frac_shift;
3821     return float64_pack_raw(&p);
3822 }
3823 
3824 float128 float128_default_nan(float_status *status)
3825 {
3826     FloatParts128 p;
3827 
3828     parts_default_nan(&p, status);
3829     frac_shr(&p, float128_params.frac_shift);
3830     return float128_pack_raw(&p);
3831 }
3832 
3833 bfloat16 bfloat16_default_nan(float_status *status)
3834 {
3835     FloatParts64 p;
3836 
3837     parts_default_nan(&p, status);
3838     p.frac >>= bfloat16_params.frac_shift;
3839     return bfloat16_pack_raw(&p);
3840 }
3841 
3842 /*----------------------------------------------------------------------------
3843 | Returns a quiet NaN from a signalling NaN for the floating-point value `a'.
3844 *----------------------------------------------------------------------------*/
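
/*----------------------------------------------------------------------------
| For example, assuming the common configuration in which a set
| most-significant fraction bit marks a quiet NaN: silencing the float32
| signalling NaN 0x7F800001 yields 0x7FC00001.  Targets with
| `snan_bit_is_one' or `default_nan_mode' set produce different patterns.
*----------------------------------------------------------------------------*/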
3845 
3846 float16 float16_silence_nan(float16 a, float_status *status)
3847 {
3848     FloatParts64 p;
3849 
3850     float16_unpack_raw(&p, a);
3851     p.frac <<= float16_params.frac_shift;
3852     parts_silence_nan(&p, status);
3853     p.frac >>= float16_params.frac_shift;
3854     return float16_pack_raw(&p);
3855 }
3856 
3857 float32 float32_silence_nan(float32 a, float_status *status)
3858 {
3859     FloatParts64 p;
3860 
3861     float32_unpack_raw(&p, a);
3862     p.frac <<= float32_params.frac_shift;
3863     parts_silence_nan(&p, status);
3864     p.frac >>= float32_params.frac_shift;
3865     return float32_pack_raw(&p);
3866 }
3867 
3868 float64 float64_silence_nan(float64 a, float_status *status)
3869 {
3870     FloatParts64 p;
3871 
3872     float64_unpack_raw(&p, a);
3873     p.frac <<= float64_params.frac_shift;
3874     parts_silence_nan(&p, status);
3875     p.frac >>= float64_params.frac_shift;
3876     return float64_pack_raw(&p);
3877 }
3878 
3879 bfloat16 bfloat16_silence_nan(bfloat16 a, float_status *status)
3880 {
3881     FloatParts64 p;
3882 
3883     bfloat16_unpack_raw(&p, a);
3884     p.frac <<= bfloat16_params.frac_shift;
3885     parts_silence_nan(&p, status);
3886     p.frac >>= bfloat16_params.frac_shift;
3887     return bfloat16_pack_raw(&p);
3888 }
3889 
3890 float128 float128_silence_nan(float128 a, float_status *status)
3891 {
3892     FloatParts128 p;
3893 
3894     float128_unpack_raw(&p, a);
3895     frac_shl(&p, float128_params.frac_shift);
3896     parts_silence_nan(&p, status);
3897     frac_shr(&p, float128_params.frac_shift);
3898     return float128_pack_raw(&p);
3899 }
3900 
3901 /*----------------------------------------------------------------------------
3902 | If `a' is denormal and we are in flush-to-zero mode then set the
3903 | input-denormal exception and return zero. Otherwise just return the value.
3904 *----------------------------------------------------------------------------*/
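
/*----------------------------------------------------------------------------
| For example: with `flush_inputs_to_zero' set,
| float32_squash_input_denormal(make_float32(0x00000001), s) raises
| float_flag_input_denormal and returns +0.0 (0x00000000); with the flag
| clear the input is returned unchanged.
*----------------------------------------------------------------------------*/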
3905 
3906 static bool parts_squash_denormal(FloatParts64 p, float_status *status)
3907 {
3908     if (p.exp == 0 && p.frac != 0) {
3909         float_raise(float_flag_input_denormal, status);
3910         return true;
3911     }
3912 
3913     return false;
3914 }
3915 
3916 float16 float16_squash_input_denormal(float16 a, float_status *status)
3917 {
3918     if (status->flush_inputs_to_zero) {
3919         FloatParts64 p;
3920 
3921         float16_unpack_raw(&p, a);
3922         if (parts_squash_denormal(p, status)) {
3923             return float16_set_sign(float16_zero, p.sign);
3924         }
3925     }
3926     return a;
3927 }
3928 
3929 float32 float32_squash_input_denormal(float32 a, float_status *status)
3930 {
3931     if (status->flush_inputs_to_zero) {
3932         FloatParts64 p;
3933 
3934         float32_unpack_raw(&p, a);
3935         if (parts_squash_denormal(p, status)) {
3936             return float32_set_sign(float32_zero, p.sign);
3937         }
3938     }
3939     return a;
3940 }
3941 
3942 float64 float64_squash_input_denormal(float64 a, float_status *status)
3943 {
3944     if (status->flush_inputs_to_zero) {
3945         FloatParts64 p;
3946 
3947         float64_unpack_raw(&p, a);
3948         if (parts_squash_denormal(p, status)) {
3949             return float64_set_sign(float64_zero, p.sign);
3950         }
3951     }
3952     return a;
3953 }
3954 
3955 bfloat16 bfloat16_squash_input_denormal(bfloat16 a, float_status *status)
3956 {
3957     if (status->flush_inputs_to_zero) {
3958         FloatParts64 p;
3959 
3960         bfloat16_unpack_raw(&p, a);
3961         if (parts_squash_denormal(p, status)) {
3962             return bfloat16_set_sign(bfloat16_zero, p.sign);
3963         }
3964     }
3965     return a;
3966 }
3967 
3968 /*----------------------------------------------------------------------------
3969 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
3970 | and 7, and returns the properly rounded 32-bit integer corresponding to the
3971 | input.  If `zSign' is 1, the input is negated before being converted to an
3972 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
3973 | is simply rounded to an integer, with the inexact exception raised if the
3974 | input cannot be represented exactly as an integer.  However, if the fixed-
3975 | point input is too large, the invalid exception is raised and the largest
3976 | positive or negative integer is returned.
3977 *----------------------------------------------------------------------------*/
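
/*----------------------------------------------------------------------------
| Worked example: with nearest-even rounding, an input of 0xC0 (1.5) and an
| input of 0x140 (2.5) both round to 2, because the halfway cases are
| resolved towards the even neighbour; the inexact exception is raised in
| both cases since nonzero fraction bits are discarded.
*----------------------------------------------------------------------------*/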
3978 
3979 static int32_t roundAndPackInt32(bool zSign, uint64_t absZ,
3980                                  float_status *status)
3981 {
3982     int8_t roundingMode;
3983     bool roundNearestEven;
3984     int8_t roundIncrement, roundBits;
3985     int32_t z;
3986 
3987     roundingMode = status->float_rounding_mode;
3988     roundNearestEven = ( roundingMode == float_round_nearest_even );
3989     switch (roundingMode) {
3990     case float_round_nearest_even:
3991     case float_round_ties_away:
3992         roundIncrement = 0x40;
3993         break;
3994     case float_round_to_zero:
3995         roundIncrement = 0;
3996         break;
3997     case float_round_up:
3998         roundIncrement = zSign ? 0 : 0x7f;
3999         break;
4000     case float_round_down:
4001         roundIncrement = zSign ? 0x7f : 0;
4002         break;
4003     case float_round_to_odd:
4004         roundIncrement = absZ & 0x80 ? 0 : 0x7f;
4005         break;
4006     default:
4007         abort();
4008     }
4009     roundBits = absZ & 0x7F;
4010     absZ = ( absZ + roundIncrement )>>7;
4011     if (!(roundBits ^ 0x40) && roundNearestEven) {
4012         absZ &= ~1;
4013     }
4014     z = absZ;
4015     if ( zSign ) z = - z;
4016     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
4017         float_raise(float_flag_invalid, status);
4018         return zSign ? INT32_MIN : INT32_MAX;
4019     }
4020     if (roundBits) {
4021         float_raise(float_flag_inexact, status);
4022     }
4023     return z;
4024 
4025 }
4026 
4027 /*----------------------------------------------------------------------------
4028 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
4029 | `absZ1', with binary point between bits 63 and 64 (between the input words),
4030 | and returns the properly rounded 64-bit integer corresponding to the input.
4031 | If `zSign' is 1, the input is negated before being converted to an integer.
4032 | Ordinarily, the fixed-point input is simply rounded to an integer, with
4033 | the inexact exception raised if the input cannot be represented exactly as
4034 | an integer.  However, if the fixed-point input is too large, the invalid
4035 | exception is raised and the largest positive or negative integer is
4036 | returned.
4037 *----------------------------------------------------------------------------*/
4038 
4039 static int64_t roundAndPackInt64(bool zSign, uint64_t absZ0, uint64_t absZ1,
4040                                float_status *status)
4041 {
4042     int8_t roundingMode;
4043     bool roundNearestEven, increment;
4044     int64_t z;
4045 
4046     roundingMode = status->float_rounding_mode;
4047     roundNearestEven = ( roundingMode == float_round_nearest_even );
4048     switch (roundingMode) {
4049     case float_round_nearest_even:
4050     case float_round_ties_away:
4051         increment = ((int64_t) absZ1 < 0);
4052         break;
4053     case float_round_to_zero:
4054         increment = 0;
4055         break;
4056     case float_round_up:
4057         increment = !zSign && absZ1;
4058         break;
4059     case float_round_down:
4060         increment = zSign && absZ1;
4061         break;
4062     case float_round_to_odd:
4063         increment = !(absZ0 & 1) && absZ1;
4064         break;
4065     default:
4066         abort();
4067     }
4068     if ( increment ) {
4069         ++absZ0;
4070         if ( absZ0 == 0 ) goto overflow;
4071         if (!(absZ1 << 1) && roundNearestEven) {
4072             absZ0 &= ~1;
4073         }
4074     }
4075     z = absZ0;
4076     if ( zSign ) z = - z;
4077     if ( z && ( ( z < 0 ) ^ zSign ) ) {
4078  overflow:
4079         float_raise(float_flag_invalid, status);
4080         return zSign ? INT64_MIN : INT64_MAX;
4081     }
4082     if (absZ1) {
4083         float_raise(float_flag_inexact, status);
4084     }
4085     return z;
4086 
4087 }
4088 
4089 /*----------------------------------------------------------------------------
4090 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
4091 | `absZ1', with binary point between bits 63 and 64 (between the input words),
4092 | and returns the properly rounded 64-bit unsigned integer corresponding to the
4093 | input.  Ordinarily, the fixed-point input is simply rounded to an integer,
4094 | with the inexact exception raised if the input cannot be represented exactly
4095 | as an integer.  However, if the fixed-point input is too large, the invalid
4096 | exception is raised and the largest unsigned integer is returned.
4097 *----------------------------------------------------------------------------*/
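
/*----------------------------------------------------------------------------
| Additionally, as the check on `zSign' below shows, a negative input whose
| rounded magnitude is nonzero raises the invalid exception and returns 0.
*----------------------------------------------------------------------------*/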
4098 
4099 static int64_t roundAndPackUint64(bool zSign, uint64_t absZ0,
4100                                 uint64_t absZ1, float_status *status)
4101 {
4102     int8_t roundingMode;
4103     bool roundNearestEven, increment;
4104 
4105     roundingMode = status->float_rounding_mode;
4106     roundNearestEven = (roundingMode == float_round_nearest_even);
4107     switch (roundingMode) {
4108     case float_round_nearest_even:
4109     case float_round_ties_away:
4110         increment = ((int64_t)absZ1 < 0);
4111         break;
4112     case float_round_to_zero:
4113         increment = 0;
4114         break;
4115     case float_round_up:
4116         increment = !zSign && absZ1;
4117         break;
4118     case float_round_down:
4119         increment = zSign && absZ1;
4120         break;
4121     case float_round_to_odd:
4122         increment = !(absZ0 & 1) && absZ1;
4123         break;
4124     default:
4125         abort();
4126     }
4127     if (increment) {
4128         ++absZ0;
4129         if (absZ0 == 0) {
4130             float_raise(float_flag_invalid, status);
4131             return UINT64_MAX;
4132         }
4133         if (!(absZ1 << 1) && roundNearestEven) {
4134             absZ0 &= ~1;
4135         }
4136     }
4137 
4138     if (zSign && absZ0) {
4139         float_raise(float_flag_invalid, status);
4140         return 0;
4141     }
4142 
4143     if (absZ1) {
4144         float_raise(float_flag_inexact, status);
4145     }
4146     return absZ0;
4147 }
4148 
4149 /*----------------------------------------------------------------------------
4150 | Normalizes the subnormal single-precision floating-point value represented
4151 | by the denormalized significand `aSig'.  The normalized exponent and
4152 | significand are stored at the locations pointed to by `zExpPtr' and
4153 | `zSigPtr', respectively.
4154 *----------------------------------------------------------------------------*/
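
/*----------------------------------------------------------------------------
| Worked example: for the smallest denormal significand aSig = 1,
| clz32(aSig) is 31, so shiftCount = 23, giving *zSigPtr = 0x00800000 (the
| implicit-one position, bit 23, now set) and *zExpPtr = 1 - 23 = -22.
*----------------------------------------------------------------------------*/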
4155 
4156 static void
4157  normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
4158 {
4159     int8_t shiftCount;
4160 
4161     shiftCount = clz32(aSig) - 8;
4162     *zSigPtr = aSig<<shiftCount;
4163     *zExpPtr = 1 - shiftCount;
4164 
4165 }
4166 
4167 /*----------------------------------------------------------------------------
4168 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4169 | and significand `zSig', and returns the proper single-precision floating-
4170 | point value corresponding to the abstract input.  Ordinarily, the abstract
4171 | value is simply rounded and packed into the single-precision format, with
4172 | the inexact exception raised if the abstract input cannot be represented
4173 | exactly.  However, if the abstract value is too large, the overflow and
4174 | inexact exceptions are raised and an infinity or maximal finite value is
4175 | returned.  If the abstract value is too small, the input value is rounded to
4176 | a subnormal number, and the underflow and inexact exceptions are raised if
4177 | the abstract input cannot be represented exactly as a subnormal single-
4178 | precision floating-point number.
4179 |     The input significand `zSig' has its binary point between bits 30
4180 | and 29, which is 7 bits to the left of the usual location.  This shifted
4181 | significand must be normalized or smaller.  If `zSig' is not normalized,
4182 | `zExp' must be 0; in that case, the result returned is a subnormal number,
4183 | and it must not require rounding.  In the usual case that `zSig' is
4184 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
4185 | The handling of underflow and overflow follows the IEC/IEEE Standard for
4186 | Binary Floating-Point Arithmetic.
4187 *----------------------------------------------------------------------------*/
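
/*----------------------------------------------------------------------------
| Worked example, assuming round-to-nearest-even:
| roundAndPackFloat32(0, 0x7E, 0x40000000, status) packs exactly to
| 0x3F800000 (1.0f) with no exceptions, since the 7 round bits are zero.
| With zSig = 0x40000040 the round bits are exactly half way, the tie is
| resolved to the even significand, the result is again 1.0f, and the
| inexact exception is raised.
*----------------------------------------------------------------------------*/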
4188 
4189 static float32 roundAndPackFloat32(bool zSign, int zExp, uint32_t zSig,
4190                                    float_status *status)
4191 {
4192     int8_t roundingMode;
4193     bool roundNearestEven;
4194     int8_t roundIncrement, roundBits;
4195     bool isTiny;
4196 
4197     roundingMode = status->float_rounding_mode;
4198     roundNearestEven = ( roundingMode == float_round_nearest_even );
4199     switch (roundingMode) {
4200     case float_round_nearest_even:
4201     case float_round_ties_away:
4202         roundIncrement = 0x40;
4203         break;
4204     case float_round_to_zero:
4205         roundIncrement = 0;
4206         break;
4207     case float_round_up:
4208         roundIncrement = zSign ? 0 : 0x7f;
4209         break;
4210     case float_round_down:
4211         roundIncrement = zSign ? 0x7f : 0;
4212         break;
4213     case float_round_to_odd:
4214         roundIncrement = zSig & 0x80 ? 0 : 0x7f;
4215         break;
4216     default:
4217         abort();
4218         break;
4219     }
4220     roundBits = zSig & 0x7F;
4221     if ( 0xFD <= (uint16_t) zExp ) {
4222         if (    ( 0xFD < zExp )
4223              || (    ( zExp == 0xFD )
4224                   && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
4225            ) {
4226             bool overflow_to_inf = roundingMode != float_round_to_odd &&
4227                                    roundIncrement != 0;
4228             float_raise(float_flag_overflow | float_flag_inexact, status);
4229             return packFloat32(zSign, 0xFF, -!overflow_to_inf);
4230         }
4231         if ( zExp < 0 ) {
4232             if (status->flush_to_zero) {
4233                 float_raise(float_flag_output_denormal, status);
4234                 return packFloat32(zSign, 0, 0);
4235             }
4236             isTiny = status->tininess_before_rounding
4237                   || (zExp < -1)
4238                   || (zSig + roundIncrement < 0x80000000);
4239             shift32RightJamming( zSig, - zExp, &zSig );
4240             zExp = 0;
4241             roundBits = zSig & 0x7F;
4242             if (isTiny && roundBits) {
4243                 float_raise(float_flag_underflow, status);
4244             }
4245             if (roundingMode == float_round_to_odd) {
4246                 /*
4247                  * For round-to-odd case, the roundIncrement depends on
4248                  * zSig which just changed.
4249                  */
4250                 roundIncrement = zSig & 0x80 ? 0 : 0x7f;
4251             }
4252         }
4253     }
4254     if (roundBits) {
4255         float_raise(float_flag_inexact, status);
4256     }
4257     zSig = ( zSig + roundIncrement )>>7;
4258     if (!(roundBits ^ 0x40) && roundNearestEven) {
4259         zSig &= ~1;
4260     }
4261     if ( zSig == 0 ) zExp = 0;
4262     return packFloat32( zSign, zExp, zSig );
4263 
4264 }
4265 
4266 /*----------------------------------------------------------------------------
4267 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4268 | and significand `zSig', and returns the proper single-precision floating-
4269 | point value corresponding to the abstract input.  This routine is just like
4270 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
4271 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
4272 | floating-point exponent.
4273 *----------------------------------------------------------------------------*/
4274 
4275 static float32
4276  normalizeRoundAndPackFloat32(bool zSign, int zExp, uint32_t zSig,
4277                               float_status *status)
4278 {
4279     int8_t shiftCount;
4280 
4281     shiftCount = clz32(zSig) - 1;
4282     return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
4283                                status);
4284 
4285 }
4286 
4287 /*----------------------------------------------------------------------------
4288 | Normalizes the subnormal double-precision floating-point value represented
4289 | by the denormalized significand `aSig'.  The normalized exponent and
4290 | significand are stored at the locations pointed to by `zExpPtr' and
4291 | `zSigPtr', respectively.
4292 *----------------------------------------------------------------------------*/
4293 
4294 static void
4295  normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
4296 {
4297     int8_t shiftCount;
4298 
4299     shiftCount = clz64(aSig) - 11;
4300     *zSigPtr = aSig<<shiftCount;
4301     *zExpPtr = 1 - shiftCount;
4302 
4303 }
4304 
4305 /*----------------------------------------------------------------------------
4306 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
4307 | double-precision floating-point value, returning the result.  After being
4308 | shifted into the proper positions, the three fields are simply added
4309 | together to form the result.  This means that any integer portion of `zSig'
4310 | will be added into the exponent.  Since a properly normalized significand
4311 | will have an integer portion equal to 1, the `zExp' input should be 1 less
4312 | than the desired result exponent whenever `zSig' is a complete, normalized
4313 | significand.
4314 *----------------------------------------------------------------------------*/
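
/*----------------------------------------------------------------------------
| For example, packFloat64(0, 0x3FE, UINT64_C(1) << 52) returns
| 0x3FF0000000000000, i.e. 1.0: the significand's integer bit carries into
| the exponent field, which is why `zExp' is passed as one less than the
| desired biased exponent.
*----------------------------------------------------------------------------*/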
4315 
4316 static inline float64 packFloat64(bool zSign, int zExp, uint64_t zSig)
4317 {
4318 
4319     return make_float64(
4320         ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
4321 
4322 }
4323 
4324 /*----------------------------------------------------------------------------
4325 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4326 | and significand `zSig', and returns the proper double-precision floating-
4327 | point value corresponding to the abstract input.  Ordinarily, the abstract
4328 | value is simply rounded and packed into the double-precision format, with
4329 | the inexact exception raised if the abstract input cannot be represented
4330 | exactly.  However, if the abstract value is too large, the overflow and
4331 | inexact exceptions are raised and an infinity or maximal finite value is
4332 | returned.  If the abstract value is too small, the input value is rounded to
4333 | a subnormal number, and the underflow and inexact exceptions are raised if
4334 | the abstract input cannot be represented exactly as a subnormal double-
4335 | precision floating-point number.
4336 |     The input significand `zSig' has its binary point between bits 62
4337 | and 61, which is 10 bits to the left of the usual location.  This shifted
4338 | significand must be normalized or smaller.  If `zSig' is not normalized,
4339 | `zExp' must be 0; in that case, the result returned is a subnormal number,
4340 | and it must not require rounding.  In the usual case that `zSig' is
4341 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
4342 | The handling of underflow and overflow follows the IEC/IEEE Standard for
4343 | Binary Floating-Point Arithmetic.
4344 *----------------------------------------------------------------------------*/
4345 
4346 static float64 roundAndPackFloat64(bool zSign, int zExp, uint64_t zSig,
4347                                    float_status *status)
4348 {
4349     int8_t roundingMode;
4350     bool roundNearestEven;
4351     int roundIncrement, roundBits;
4352     bool isTiny;
4353 
4354     roundingMode = status->float_rounding_mode;
4355     roundNearestEven = ( roundingMode == float_round_nearest_even );
4356     switch (roundingMode) {
4357     case float_round_nearest_even:
4358     case float_round_ties_away:
4359         roundIncrement = 0x200;
4360         break;
4361     case float_round_to_zero:
4362         roundIncrement = 0;
4363         break;
4364     case float_round_up:
4365         roundIncrement = zSign ? 0 : 0x3ff;
4366         break;
4367     case float_round_down:
4368         roundIncrement = zSign ? 0x3ff : 0;
4369         break;
4370     case float_round_to_odd:
4371         roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
4372         break;
4373     default:
4374         abort();
4375     }
4376     roundBits = zSig & 0x3FF;
4377     if ( 0x7FD <= (uint16_t) zExp ) {
4378         if (    ( 0x7FD < zExp )
4379              || (    ( zExp == 0x7FD )
4380                   && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
4381            ) {
4382             bool overflow_to_inf = roundingMode != float_round_to_odd &&
4383                                    roundIncrement != 0;
4384             float_raise(float_flag_overflow | float_flag_inexact, status);
4385             return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
4386         }
4387         if ( zExp < 0 ) {
4388             if (status->flush_to_zero) {
4389                 float_raise(float_flag_output_denormal, status);
4390                 return packFloat64(zSign, 0, 0);
4391             }
4392             isTiny = status->tininess_before_rounding
4393                   || (zExp < -1)
4394                   || (zSig + roundIncrement < UINT64_C(0x8000000000000000));
4395             shift64RightJamming( zSig, - zExp, &zSig );
4396             zExp = 0;
4397             roundBits = zSig & 0x3FF;
4398             if (isTiny && roundBits) {
4399                 float_raise(float_flag_underflow, status);
4400             }
4401             if (roundingMode == float_round_to_odd) {
4402                 /*
4403                  * For round-to-odd case, the roundIncrement depends on
4404                  * zSig which just changed.
4405                  */
4406                 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
4407             }
4408         }
4409     }
4410     if (roundBits) {
4411         float_raise(float_flag_inexact, status);
4412     }
4413     zSig = ( zSig + roundIncrement )>>10;
4414     if (!(roundBits ^ 0x200) && roundNearestEven) {
4415         zSig &= ~1;
4416     }
4417     if ( zSig == 0 ) zExp = 0;
4418     return packFloat64( zSign, zExp, zSig );
4419 
4420 }
4421 
4422 /*----------------------------------------------------------------------------
4423 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4424 | and significand `zSig', and returns the proper double-precision floating-
4425 | point value corresponding to the abstract input.  This routine is just like
4426 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
4427 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
4428 | floating-point exponent.
4429 *----------------------------------------------------------------------------*/
4430 
4431 static float64
4432  normalizeRoundAndPackFloat64(bool zSign, int zExp, uint64_t zSig,
4433                               float_status *status)
4434 {
4435     int8_t shiftCount;
4436 
4437     shiftCount = clz64(zSig) - 1;
4438     return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
4439                                status);
4440 
4441 }
4442 
4443 /*----------------------------------------------------------------------------
4444 | Normalizes the subnormal extended double-precision floating-point value
4445 | represented by the denormalized significand `aSig'.  The normalized exponent
4446 | and significand are stored at the locations pointed to by `zExpPtr' and
4447 | `zSigPtr', respectively.
4448 *----------------------------------------------------------------------------*/
4449 
4450 void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
4451                                 uint64_t *zSigPtr)
4452 {
4453     int8_t shiftCount;
4454 
4455     shiftCount = clz64(aSig);
4456     *zSigPtr = aSig<<shiftCount;
4457     *zExpPtr = 1 - shiftCount;
4458 }
4459 
4460 /*----------------------------------------------------------------------------
4461 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4462 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
4463 | and returns the proper extended double-precision floating-point value
4464 | corresponding to the abstract input.  Ordinarily, the abstract value is
4465 | rounded and packed into the extended double-precision format, with the
4466 | inexact exception raised if the abstract input cannot be represented
4467 | exactly.  However, if the abstract value is too large, the overflow and
4468 | inexact exceptions are raised and an infinity or maximal finite value is
4469 | returned.  If the abstract value is too small, the input value is rounded to
4470 | a subnormal number, and the underflow and inexact exceptions are raised if
4471 | the abstract input cannot be represented exactly as a subnormal extended
4472 | double-precision floating-point number.
4473 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
4474 | number of bits as single or double precision, respectively.  Otherwise, the
4475 | result is rounded to the full precision of the extended double-precision
4476 | format.
4477 |     The input significand must be normalized or smaller.  If the input
4478 | significand is not normalized, `zExp' must be 0; in that case, the result
4479 | returned is a subnormal number, and it must not require rounding.  The
4480 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
4481 | Floating-Point Arithmetic.
4482 *----------------------------------------------------------------------------*/
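
/*----------------------------------------------------------------------------
| As an illustration of the reduced-precision paths below: with
| `roundingPrecision' 64, the low 11 bits of `zSig0' (roundMask 0x7FF) act
| as round/sticky bits, leaving the 53-bit significand of double precision;
| with 32, the low 40 bits (roundMask 0xFFFFFFFFFF) do, leaving the 24-bit
| significand of single precision.
*----------------------------------------------------------------------------*/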
4483 
4484 floatx80 roundAndPackFloatx80(int8_t roundingPrecision, bool zSign,
4485                               int32_t zExp, uint64_t zSig0, uint64_t zSig1,
4486                               float_status *status)
4487 {
4488     int8_t roundingMode;
4489     bool roundNearestEven, increment, isTiny;
4490     int64_t roundIncrement, roundMask, roundBits;
4491 
4492     roundingMode = status->float_rounding_mode;
4493     roundNearestEven = ( roundingMode == float_round_nearest_even );
4494     if ( roundingPrecision == 80 ) goto precision80;
4495     if ( roundingPrecision == 64 ) {
4496         roundIncrement = UINT64_C(0x0000000000000400);
4497         roundMask = UINT64_C(0x00000000000007FF);
4498     }
4499     else if ( roundingPrecision == 32 ) {
4500         roundIncrement = UINT64_C(0x0000008000000000);
4501         roundMask = UINT64_C(0x000000FFFFFFFFFF);
4502     }
4503     else {
4504         goto precision80;
4505     }
4506     zSig0 |= ( zSig1 != 0 );
4507     switch (roundingMode) {
4508     case float_round_nearest_even:
4509     case float_round_ties_away:
4510         break;
4511     case float_round_to_zero:
4512         roundIncrement = 0;
4513         break;
4514     case float_round_up:
4515         roundIncrement = zSign ? 0 : roundMask;
4516         break;
4517     case float_round_down:
4518         roundIncrement = zSign ? roundMask : 0;
4519         break;
4520     default:
4521         abort();
4522     }
4523     roundBits = zSig0 & roundMask;
4524     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
4525         if (    ( 0x7FFE < zExp )
4526              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
4527            ) {
4528             goto overflow;
4529         }
4530         if ( zExp <= 0 ) {
4531             if (status->flush_to_zero) {
4532                 float_raise(float_flag_output_denormal, status);
4533                 return packFloatx80(zSign, 0, 0);
4534             }
4535             isTiny = status->tininess_before_rounding
4536                   || (zExp < 0 )
4537                   || (zSig0 <= zSig0 + roundIncrement);
4538             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
4539             zExp = 0;
4540             roundBits = zSig0 & roundMask;
4541             if (isTiny && roundBits) {
4542                 float_raise(float_flag_underflow, status);
4543             }
4544             if (roundBits) {
4545                 float_raise(float_flag_inexact, status);
4546             }
4547             zSig0 += roundIncrement;
4548             if ( (int64_t) zSig0 < 0 ) zExp = 1;
4549             roundIncrement = roundMask + 1;
4550             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
4551                 roundMask |= roundIncrement;
4552             }
4553             zSig0 &= ~ roundMask;
4554             return packFloatx80( zSign, zExp, zSig0 );
4555         }
4556     }
4557     if (roundBits) {
4558         float_raise(float_flag_inexact, status);
4559     }
4560     zSig0 += roundIncrement;
4561     if ( zSig0 < roundIncrement ) {
4562         ++zExp;
4563         zSig0 = UINT64_C(0x8000000000000000);
4564     }
4565     roundIncrement = roundMask + 1;
4566     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
4567         roundMask |= roundIncrement;
4568     }
4569     zSig0 &= ~ roundMask;
4570     if ( zSig0 == 0 ) zExp = 0;
4571     return packFloatx80( zSign, zExp, zSig0 );
4572  precision80:
4573     switch (roundingMode) {
4574     case float_round_nearest_even:
4575     case float_round_ties_away:
4576         increment = ((int64_t)zSig1 < 0);
4577         break;
4578     case float_round_to_zero:
4579         increment = 0;
4580         break;
4581     case float_round_up:
4582         increment = !zSign && zSig1;
4583         break;
4584     case float_round_down:
4585         increment = zSign && zSig1;
4586         break;
4587     default:
4588         abort();
4589     }
4590     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
4591         if (    ( 0x7FFE < zExp )
4592              || (    ( zExp == 0x7FFE )
4593                   && ( zSig0 == UINT64_C(0xFFFFFFFFFFFFFFFF) )
4594                   && increment
4595                 )
4596            ) {
4597             roundMask = 0;
4598  overflow:
4599             float_raise(float_flag_overflow | float_flag_inexact, status);
4600             if (    ( roundingMode == float_round_to_zero )
4601                  || ( zSign && ( roundingMode == float_round_up ) )
4602                  || ( ! zSign && ( roundingMode == float_round_down ) )
4603                ) {
4604                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
4605             }
4606             return packFloatx80(zSign,
4607                                 floatx80_infinity_high,
4608                                 floatx80_infinity_low);
4609         }
4610         if ( zExp <= 0 ) {
4611             isTiny = status->tininess_before_rounding
4612                   || (zExp < 0)
4613                   || !increment
4614                   || (zSig0 < UINT64_C(0xFFFFFFFFFFFFFFFF));
4615             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
4616             zExp = 0;
4617             if (isTiny && zSig1) {
4618                 float_raise(float_flag_underflow, status);
4619             }
4620             if (zSig1) {
4621                 float_raise(float_flag_inexact, status);
4622             }
4623             switch (roundingMode) {
4624             case float_round_nearest_even:
4625             case float_round_ties_away:
4626                 increment = ((int64_t)zSig1 < 0);
4627                 break;
4628             case float_round_to_zero:
4629                 increment = 0;
4630                 break;
4631             case float_round_up:
4632                 increment = !zSign && zSig1;
4633                 break;
4634             case float_round_down:
4635                 increment = zSign && zSig1;
4636                 break;
4637             default:
4638                 abort();
4639             }
4640             if ( increment ) {
4641                 ++zSig0;
4642                 if (!(zSig1 << 1) && roundNearestEven) {
4643                     zSig0 &= ~1;
4644                 }
4645                 if ( (int64_t) zSig0 < 0 ) zExp = 1;
4646             }
4647             return packFloatx80( zSign, zExp, zSig0 );
4648         }
4649     }
4650     if (zSig1) {
4651         float_raise(float_flag_inexact, status);
4652     }
4653     if ( increment ) {
4654         ++zSig0;
4655         if ( zSig0 == 0 ) {
4656             ++zExp;
4657             zSig0 = UINT64_C(0x8000000000000000);
4658         }
4659         else {
4660             if (!(zSig1 << 1) && roundNearestEven) {
4661                 zSig0 &= ~1;
4662             }
4663         }
4664     }
4665     else {
4666         if ( zSig0 == 0 ) zExp = 0;
4667     }
4668     return packFloatx80( zSign, zExp, zSig0 );
4669 
4670 }
4671 
4672 /*----------------------------------------------------------------------------
4673 | Takes an abstract floating-point value having sign `zSign', exponent
4674 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
4675 | and returns the proper extended double-precision floating-point value
4676 | corresponding to the abstract input.  This routine is just like
4677 | `roundAndPackFloatx80' except that the input significand does not have to be
4678 | normalized.
4679 *----------------------------------------------------------------------------*/
4680 
4681 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
4682                                        bool zSign, int32_t zExp,
4683                                        uint64_t zSig0, uint64_t zSig1,
4684                                        float_status *status)
4685 {
4686     int8_t shiftCount;
4687 
4688     if ( zSig0 == 0 ) {
4689         zSig0 = zSig1;
4690         zSig1 = 0;
4691         zExp -= 64;
4692     }
4693     shiftCount = clz64(zSig0);
4694     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4695     zExp -= shiftCount;
4696     return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
4697                                 zSig0, zSig1, status);
4698 
4699 }
4700 
4701 /*----------------------------------------------------------------------------
4702 | Returns the least-significant 64 fraction bits of the quadruple-precision
4703 | floating-point value `a'.
4704 *----------------------------------------------------------------------------*/
4705 
4706 static inline uint64_t extractFloat128Frac1( float128 a )
4707 {
4708 
4709     return a.low;
4710 
4711 }
4712 
4713 /*----------------------------------------------------------------------------
4714 | Returns the most-significant 48 fraction bits of the quadruple-precision
4715 | floating-point value `a'.
4716 *----------------------------------------------------------------------------*/
4717 
4718 static inline uint64_t extractFloat128Frac0( float128 a )
4719 {
4720 
4721     return a.high & UINT64_C(0x0000FFFFFFFFFFFF);
4722 
4723 }
4724 
4725 /*----------------------------------------------------------------------------
4726 | Returns the exponent bits of the quadruple-precision floating-point value
4727 | `a'.
4728 *----------------------------------------------------------------------------*/
4729 
4730 static inline int32_t extractFloat128Exp( float128 a )
4731 {
4732 
4733     return ( a.high>>48 ) & 0x7FFF;
4734 
4735 }
4736 
4737 /*----------------------------------------------------------------------------
4738 | Returns the sign bit of the quadruple-precision floating-point value `a'.
4739 *----------------------------------------------------------------------------*/
4740 
4741 static inline bool extractFloat128Sign(float128 a)
4742 {
4743     return a.high >> 63;
4744 }
4745 
4746 /*----------------------------------------------------------------------------
4747 | Normalizes the subnormal quadruple-precision floating-point value
4748 | represented by the denormalized significand formed by the concatenation of
4749 | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
4750 | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
4751 | significand are stored at the location pointed to by `zSig0Ptr', and the
4752 | least significant 64 bits of the normalized significand are stored at the
4753 | location pointed to by `zSig1Ptr'.
4754 *----------------------------------------------------------------------------*/
4755 
4756 static void
4757  normalizeFloat128Subnormal(
4758      uint64_t aSig0,
4759      uint64_t aSig1,
4760      int32_t *zExpPtr,
4761      uint64_t *zSig0Ptr,
4762      uint64_t *zSig1Ptr
4763  )
4764 {
4765     int8_t shiftCount;
4766 
4767     if ( aSig0 == 0 ) {
4768         shiftCount = clz64(aSig1) - 15;
4769         if ( shiftCount < 0 ) {
4770             *zSig0Ptr = aSig1>>( - shiftCount );
4771             *zSig1Ptr = aSig1<<( shiftCount & 63 );
4772         }
4773         else {
4774             *zSig0Ptr = aSig1<<shiftCount;
4775             *zSig1Ptr = 0;
4776         }
4777         *zExpPtr = - shiftCount - 63;
4778     }
4779     else {
4780         shiftCount = clz64(aSig0) - 15;
4781         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
4782         *zExpPtr = 1 - shiftCount;
4783     }
4784 
4785 }
4786 
4787 /*----------------------------------------------------------------------------
4788 | Packs the sign `zSign', the exponent `zExp', and the significand formed
4789 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
4790 | floating-point value, returning the result.  After being shifted into the
4791 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
4792 | added together to form the most significant 64 bits of the result.  This
4793 | means that any integer portion of `zSig0' will be added into the exponent.
4794 | Since a properly normalized significand will have an integer portion equal
4795 | to 1, the `zExp' input should be 1 less than the desired result exponent
4796 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
4797 | significand.
4798 *----------------------------------------------------------------------------*/
4799 
4800 static inline float128
4801 packFloat128(bool zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1)
4802 {
4803     float128 z;
4804 
4805     z.low = zSig1;
4806     z.high = ((uint64_t)zSign << 63) + ((uint64_t)zExp << 48) + zSig0;
4807     return z;
4808 }
4809 
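/*----------------------------------------------------------------------------
| Illustrative sketch, not part of the original SoftFloat sources: a worked
| example of the "`zExp' should be 1 less than the desired result exponent"
| convention described above.  The helper name is hypothetical.
*----------------------------------------------------------------------------*/

static inline float128 packFloat128_example_one(void)
{
    /* +1.0 has biased exponent 0x3FFF and an all-zero fraction.  Passing
     * zExp = 0x3FFE together with a significand whose explicit integer bit
     * (bit 48 of zSig0) is set lets that bit carry into the exponent field:
     * 0x3FFE << 48 plus 0x0001000000000000 gives high = 0x3FFF000000000000,
     * which is exactly +1.0.  */
    return packFloat128(0, 0x3FFE, UINT64_C(0x0001000000000000), 0);
}
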
4810 /*----------------------------------------------------------------------------
4811 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4812 | and extended significand formed by the concatenation of `zSig0', `zSig1',
4813 | and `zSig2', and returns the proper quadruple-precision floating-point value
4814 | corresponding to the abstract input.  Ordinarily, the abstract value is
4815 | simply rounded and packed into the quadruple-precision format, with the
4816 | inexact exception raised if the abstract input cannot be represented
4817 | exactly.  However, if the abstract value is too large, the overflow and
4818 | inexact exceptions are raised and an infinity or maximal finite value is
4819 | returned.  If the abstract value is too small, the input value is rounded to
4820 | a subnormal number, and the underflow and inexact exceptions are raised if
4821 | the abstract input cannot be represented exactly as a subnormal quadruple-
4822 | precision floating-point number.
4823 |     The input significand must be normalized or smaller.  If the input
4824 | significand is not normalized, `zExp' must be 0; in that case, the result
4825 | returned is a subnormal number, and it must not require rounding.  In the
4826 | usual case that the input significand is normalized, `zExp' must be 1 less
4827 | than the ``true'' floating-point exponent.  The handling of underflow and
4828 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4829 *----------------------------------------------------------------------------*/
4830 
4831 static float128 roundAndPackFloat128(bool zSign, int32_t zExp,
4832                                      uint64_t zSig0, uint64_t zSig1,
4833                                      uint64_t zSig2, float_status *status)
4834 {
4835     int8_t roundingMode;
4836     bool roundNearestEven, increment, isTiny;
4837 
4838     roundingMode = status->float_rounding_mode;
4839     roundNearestEven = ( roundingMode == float_round_nearest_even );
4840     switch (roundingMode) {
4841     case float_round_nearest_even:
4842     case float_round_ties_away:
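        /* The most-significant bit of zSig2 has the weight of half a unit in
         * the last place of zSig1, so a negative (int64_t)zSig2 means the
         * discarded part is at least 1/2 ulp and the significand is rounded
         * up. */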
4843         increment = ((int64_t)zSig2 < 0);
4844         break;
4845     case float_round_to_zero:
4846         increment = 0;
4847         break;
4848     case float_round_up:
4849         increment = !zSign && zSig2;
4850         break;
4851     case float_round_down:
4852         increment = zSign && zSig2;
4853         break;
4854     case float_round_to_odd:
4855         increment = !(zSig1 & 0x1) && zSig2;
4856         break;
4857     default:
4858         abort();
4859     }
4860     if ( 0x7FFD <= (uint32_t) zExp ) {
4861         if (    ( 0x7FFD < zExp )
4862              || (    ( zExp == 0x7FFD )
4863                   && eq128(
4864                          UINT64_C(0x0001FFFFFFFFFFFF),
4865                          UINT64_C(0xFFFFFFFFFFFFFFFF),
4866                          zSig0,
4867                          zSig1
4868                      )
4869                   && increment
4870                 )
4871            ) {
4872             float_raise(float_flag_overflow | float_flag_inexact, status);
4873             if (    ( roundingMode == float_round_to_zero )
4874                  || ( zSign && ( roundingMode == float_round_up ) )
4875                  || ( ! zSign && ( roundingMode == float_round_down ) )
4876                  || (roundingMode == float_round_to_odd)
4877                ) {
4878                 return
4879                     packFloat128(
4880                         zSign,
4881                         0x7FFE,
4882                         UINT64_C(0x0000FFFFFFFFFFFF),
4883                         UINT64_C(0xFFFFFFFFFFFFFFFF)
4884                     );
4885             }
4886             return packFloat128( zSign, 0x7FFF, 0, 0 );
4887         }
4888         if ( zExp < 0 ) {
4889             if (status->flush_to_zero) {
4890                 float_raise(float_flag_output_denormal, status);
4891                 return packFloat128(zSign, 0, 0, 0);
4892             }
4893             isTiny = status->tininess_before_rounding
4894                   || (zExp < -1)
4895                   || !increment
4896                   || lt128(zSig0, zSig1,
4897                            UINT64_C(0x0001FFFFFFFFFFFF),
4898                            UINT64_C(0xFFFFFFFFFFFFFFFF));
4899             shift128ExtraRightJamming(
4900                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
4901             zExp = 0;
4902             if (isTiny && zSig2) {
4903                 float_raise(float_flag_underflow, status);
4904             }
4905             switch (roundingMode) {
4906             case float_round_nearest_even:
4907             case float_round_ties_away:
4908                 increment = ((int64_t)zSig2 < 0);
4909                 break;
4910             case float_round_to_zero:
4911                 increment = 0;
4912                 break;
4913             case float_round_up:
4914                 increment = !zSign && zSig2;
4915                 break;
4916             case float_round_down:
4917                 increment = zSign && zSig2;
4918                 break;
4919             case float_round_to_odd:
4920                 increment = !(zSig1 & 0x1) && zSig2;
4921                 break;
4922             default:
4923                 abort();
4924             }
4925         }
4926     }
4927     if (zSig2) {
4928         float_raise(float_flag_inexact, status);
4929     }
4930     if ( increment ) {
4931         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
4932         if ((zSig2 + zSig2 == 0) && roundNearestEven) {
4933             zSig1 &= ~1;
4934         }
4935     }
4936     else {
4937         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
4938     }
4939     return packFloat128( zSign, zExp, zSig0, zSig1 );
4940 
4941 }
4942 
4943 /*----------------------------------------------------------------------------
4944 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4945 | and significand formed by the concatenation of `zSig0' and `zSig1', and
4946 | returns the proper quadruple-precision floating-point value corresponding
4947 | to the abstract input.  This routine is just like `roundAndPackFloat128'
4948 | except that the input significand has fewer bits and does not have to be
4949 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
4950 | point exponent.
4951 *----------------------------------------------------------------------------*/
4952 
4953 static float128 normalizeRoundAndPackFloat128(bool zSign, int32_t zExp,
4954                                               uint64_t zSig0, uint64_t zSig1,
4955                                               float_status *status)
4956 {
4957     int8_t shiftCount;
4958     uint64_t zSig2;
4959 
4960     if ( zSig0 == 0 ) {
4961         zSig0 = zSig1;
4962         zSig1 = 0;
4963         zExp -= 64;
4964     }
4965     shiftCount = clz64(zSig0) - 15;
4966     if ( 0 <= shiftCount ) {
4967         zSig2 = 0;
4968         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4969     }
4970     else {
4971         shift128ExtraRightJamming(
4972             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
4973     }
4974     zExp -= shiftCount;
4975     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
4976 
4977 }
4978 
4979 
4980 /*----------------------------------------------------------------------------
4981 | Returns the result of converting the 32-bit two's complement integer `a'
4982 | to the extended double-precision floating-point format.  The conversion
4983 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4984 | Arithmetic.
4985 *----------------------------------------------------------------------------*/
4986 
4987 floatx80 int32_to_floatx80(int32_t a, float_status *status)
4988 {
4989     bool zSign;
4990     uint32_t absA;
4991     int8_t shiftCount;
4992     uint64_t zSig;
4993 
4994     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
4995     zSign = ( a < 0 );
4996     absA = zSign ? - a : a;
4997     shiftCount = clz32(absA) + 32;
4998     zSig = absA;
4999     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
5000 
5001 }
5002 
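/*----------------------------------------------------------------------------
| Illustrative sketch, not part of the original SoftFloat sources: for a == 1,
| clz32(1) == 31, so shiftCount == 63 and the routine above returns
| packFloatx80(0, 0x403E - 63, (uint64_t)1 << 63), i.e. biased exponent 0x3FFF
| with the explicit integer bit set, which encodes +1.0 in extended double
| precision.  The helper name is hypothetical.
*----------------------------------------------------------------------------*/

static inline floatx80 int32_to_floatx80_example_one(float_status *status)
{
    return int32_to_floatx80(1, status);
}
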
5003 /*----------------------------------------------------------------------------
5004 | Returns the result of converting the 32-bit two's complement integer `a' to
5005 | the quadruple-precision floating-point format.  The conversion is performed
5006 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5007 *----------------------------------------------------------------------------*/
5008 
5009 float128 int32_to_float128(int32_t a, float_status *status)
5010 {
5011     bool zSign;
5012     uint32_t absA;
5013     int8_t shiftCount;
5014     uint64_t zSig0;
5015 
5016     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
5017     zSign = ( a < 0 );
5018     absA = zSign ? - a : a;
5019     shiftCount = clz32(absA) + 17;
5020     zSig0 = absA;
5021     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
5022 
5023 }
5024 
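/*----------------------------------------------------------------------------
| Illustrative sketch, not part of the original SoftFloat sources: for
| a == -6, absA == 6 and clz32(6) == 29, so shiftCount == 46 and
| zSig0 == 6 << 46 == 0x0001800000000000.  Bit 48 then carries into the
| exponent inside packFloat128, giving high == 0xC001800000000000: sign 1,
| biased exponent 0x4001, fraction 0.5, that is, -1.5 * 2^2 == -6.  The helper
| name is hypothetical.
*----------------------------------------------------------------------------*/

static inline float128 int32_to_float128_example_minus_six(float_status *status)
{
    return int32_to_float128(-6, status);
}
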
5025 /*----------------------------------------------------------------------------
5026 | Returns the result of converting the 64-bit two's complement integer `a'
5027 | to the extended double-precision floating-point format.  The conversion
5028 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5029 | Arithmetic.
5030 *----------------------------------------------------------------------------*/
5031 
5032 floatx80 int64_to_floatx80(int64_t a, float_status *status)
5033 {
5034     bool zSign;
5035     uint64_t absA;
5036     int8_t shiftCount;
5037 
5038     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
5039     zSign = ( a < 0 );
5040     absA = zSign ? - a : a;
5041     shiftCount = clz64(absA);
5042     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
5043 
5044 }
5045 
5046 /*----------------------------------------------------------------------------
5047 | Returns the result of converting the 64-bit two's complement integer `a' to
5048 | the quadruple-precision floating-point format.  The conversion is performed
5049 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5050 *----------------------------------------------------------------------------*/
5051 
5052 float128 int64_to_float128(int64_t a, float_status *status)
5053 {
5054     bool zSign;
5055     uint64_t absA;
5056     int8_t shiftCount;
5057     int32_t zExp;
5058     uint64_t zSig0, zSig1;
5059 
5060     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
5061     zSign = ( a < 0 );
5062     absA = zSign ? - a : a;
5063     shiftCount = clz64(absA) + 49;
5064     zExp = 0x406E - shiftCount;
5065     if ( 64 <= shiftCount ) {
5066         zSig1 = 0;
5067         zSig0 = absA;
5068         shiftCount -= 64;
5069     }
5070     else {
5071         zSig1 = absA;
5072         zSig0 = 0;
5073     }
5074     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
5075     return packFloat128( zSign, zExp, zSig0, zSig1 );
5076 
5077 }
5078 
5079 /*----------------------------------------------------------------------------
5080 | Returns the result of converting the 64-bit unsigned integer `a'
5081 | to the quadruple-precision floating-point format.  The conversion is performed
5082 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5083 *----------------------------------------------------------------------------*/
5084 
5085 float128 uint64_to_float128(uint64_t a, float_status *status)
5086 {
5087     if (a == 0) {
5088         return float128_zero;
5089     }
5090     return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status);
5091 }
5092 
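/*----------------------------------------------------------------------------
| Illustrative sketch, not part of the original SoftFloat sources: for
| a == UINT64_C(1) << 63, normalizeRoundAndPackFloat128 above shifts the
| significand right by 15 and packs biased exponent 0x403E (0x3FFF + 63) with
| an all-zero fraction, i.e. high == 0x403E000000000000, which is 2^63.  The
| helper name is hypothetical.
*----------------------------------------------------------------------------*/

static inline float128 uint64_to_float128_example_pow63(float_status *status)
{
    return uint64_to_float128(UINT64_C(1) << 63, status);
}
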
5093 /*----------------------------------------------------------------------------
5094 | Returns the result of converting the single-precision floating-point value
5095 | `a' to the extended double-precision floating-point format.  The conversion
5096 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5097 | Arithmetic.
5098 *----------------------------------------------------------------------------*/
5099 
5100 floatx80 float32_to_floatx80(float32 a, float_status *status)
5101 {
5102     bool aSign;
5103     int aExp;
5104     uint32_t aSig;
5105 
5106     a = float32_squash_input_denormal(a, status);
5107     aSig = extractFloat32Frac( a );
5108     aExp = extractFloat32Exp( a );
5109     aSign = extractFloat32Sign( a );
5110     if ( aExp == 0xFF ) {
5111         if (aSig) {
5112             floatx80 res = commonNaNToFloatx80(float32ToCommonNaN(a, status),
5113                                                status);
5114             return floatx80_silence_nan(res, status);
5115         }
5116         return packFloatx80(aSign,
5117                             floatx80_infinity_high,
5118                             floatx80_infinity_low);
5119     }
5120     if ( aExp == 0 ) {
5121         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
5122         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
5123     }
5124     aSig |= 0x00800000;
5125     return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
5126 
5127 }
5128 
5129 /*----------------------------------------------------------------------------
5130 | Returns the result of converting the single-precision floating-point value
5131 | `a' to the quadruple-precision floating-point format.  The conversion is
5132 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5133 | Arithmetic.
5134 *----------------------------------------------------------------------------*/
5135 
5136 float128 float32_to_float128(float32 a, float_status *status)
5137 {
5138     bool aSign;
5139     int aExp;
5140     uint32_t aSig;
5141 
5142     a = float32_squash_input_denormal(a, status);
5143     aSig = extractFloat32Frac( a );
5144     aExp = extractFloat32Exp( a );
5145     aSign = extractFloat32Sign( a );
5146     if ( aExp == 0xFF ) {
5147         if (aSig) {
5148             return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
5149         }
5150         return packFloat128( aSign, 0x7FFF, 0, 0 );
5151     }
5152     if ( aExp == 0 ) {
5153         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
5154         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
5155         --aExp;
5156     }
5157     return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
5158 
5159 }
5160 
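/*----------------------------------------------------------------------------
| Illustrative note and sketch, not part of the original SoftFloat sources:
| unlike the floatx80 conversion above, no explicit integer bit is OR'ed in
| here, because the quadruple-precision format keeps its leading 1 implicit;
| the 23 single-precision fraction bits are shifted up by 25 so that they
| occupy bits 47..25 of the high fraction word.  In the subnormal path the
| normalized `aSig' does carry an explicit leading 1 at bit 23, which the
| shift moves to bit 48, where packFloat128 folds it back into the exponent;
| that is what the `--aExp' above compensates for.  The helper name is
| hypothetical.
*----------------------------------------------------------------------------*/

static inline float128 float32_to_float128_example_one(float_status *status)
{
    /* 1.0f: aSig == 0 and aExp == 0x7F, so the result is
     * packFloat128(0, 0x7F + 0x3F80, 0, 0), i.e. high == 0x3FFF000000000000. */
    return float32_to_float128(float32_one, status);
}
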
5161 /*----------------------------------------------------------------------------
5162 | Returns the remainder of the single-precision floating-point value `a'
5163 | with respect to the corresponding value `b'.  The operation is performed
5164 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5165 *----------------------------------------------------------------------------*/
5166 
5167 float32 float32_rem(float32 a, float32 b, float_status *status)
5168 {
5169     bool aSign, zSign;
5170     int aExp, bExp, expDiff;
5171     uint32_t aSig, bSig;
5172     uint32_t q;
5173     uint64_t aSig64, bSig64, q64;
5174     uint32_t alternateASig;
5175     int32_t sigMean;
5176     a = float32_squash_input_denormal(a, status);
5177     b = float32_squash_input_denormal(b, status);
5178 
5179     aSig = extractFloat32Frac( a );
5180     aExp = extractFloat32Exp( a );
5181     aSign = extractFloat32Sign( a );
5182     bSig = extractFloat32Frac( b );
5183     bExp = extractFloat32Exp( b );
5184     if ( aExp == 0xFF ) {
5185         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
5186             return propagateFloat32NaN(a, b, status);
5187         }
5188         float_raise(float_flag_invalid, status);
5189         return float32_default_nan(status);
5190     }
5191     if ( bExp == 0xFF ) {
5192         if (bSig) {
5193             return propagateFloat32NaN(a, b, status);
5194         }
5195         return a;
5196     }
5197     if ( bExp == 0 ) {
5198         if ( bSig == 0 ) {
5199             float_raise(float_flag_invalid, status);
5200             return float32_default_nan(status);
5201         }
5202         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
5203     }
5204     if ( aExp == 0 ) {
5205         if ( aSig == 0 ) return a;
5206         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
5207     }
5208     expDiff = aExp - bExp;
5209     aSig |= 0x00800000;
5210     bSig |= 0x00800000;
5211     if ( expDiff < 32 ) {
5212         aSig <<= 8;
5213         bSig <<= 8;
5214         if ( expDiff < 0 ) {
5215             if ( expDiff < -1 ) return a;
5216             aSig >>= 1;
5217         }
5218         q = ( bSig <= aSig );
5219         if ( q ) aSig -= bSig;
5220         if ( 0 < expDiff ) {
5221             q = ( ( (uint64_t) aSig )<<32 ) / bSig;
5222             q >>= 32 - expDiff;
5223             bSig >>= 2;
5224             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
5225         }
5226         else {
5227             aSig >>= 2;
5228             bSig >>= 2;
5229         }
5230     }
5231     else {
5232         if ( bSig <= aSig ) aSig -= bSig;
5233         aSig64 = ( (uint64_t) aSig )<<40;
5234         bSig64 = ( (uint64_t) bSig )<<40;
5235         expDiff -= 64;
5236         while ( 0 < expDiff ) {
5237             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
5238             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
5239             aSig64 = - ( ( bSig * q64 )<<38 );
5240             expDiff -= 62;
5241         }
5242         expDiff += 64;
5243         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
5244         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
5245         q = q64>>( 64 - expDiff );
5246         bSig <<= 6;
5247         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
5248     }
5249     do {
5250         alternateASig = aSig;
5251         ++q;
5252         aSig -= bSig;
5253     } while ( 0 <= (int32_t) aSig );
5254     sigMean = aSig + alternateASig;
5255     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
5256         aSig = alternateASig;
5257     }
5258     zSign = ( (int32_t) aSig < 0 );
5259     if ( zSign ) aSig = - aSig;
5260     return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
5261 }
5262 
5263 
5264 
5265 /*----------------------------------------------------------------------------
5266 | Returns the binary exponential of the single-precision floating-point value
5267 | `a'. The operation is performed according to the IEC/IEEE Standard for
5268 | Binary Floating-Point Arithmetic.
5269 |
5270 | Uses the following identities:
5271 |
5272 | 1. -------------------------------------------------------------------------
5273 |      x    x*ln(2)
5274 |     2  = e
5275 |
5276 | 2. -------------------------------------------------------------------------
5277 |                      2     3     4     5           n
5278 |      x        x     x     x     x     x           x
5279 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
5280 |               1!    2!    3!    4!    5!          n!
5281 *----------------------------------------------------------------------------*/
5282 
5283 static const float64 float32_exp2_coefficients[15] =
5284 {
5285     const_float64( 0x3ff0000000000000ll ), /*  1 */
5286     const_float64( 0x3fe0000000000000ll ), /*  2 */
5287     const_float64( 0x3fc5555555555555ll ), /*  3 */
5288     const_float64( 0x3fa5555555555555ll ), /*  4 */
5289     const_float64( 0x3f81111111111111ll ), /*  5 */
5290     const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
5291     const_float64( 0x3f2a01a01a01a01all ), /*  7 */
5292     const_float64( 0x3efa01a01a01a01all ), /*  8 */
5293     const_float64( 0x3ec71de3a556c734ll ), /*  9 */
5294     const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
5295     const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
5296     const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
5297     const_float64( 0x3de6124613a86d09ll ), /* 13 */
5298     const_float64( 0x3da93974a8c07c9dll ), /* 14 */
5299     const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
5300 };
5301 
5302 float32 float32_exp2(float32 a, float_status *status)
5303 {
5304     bool aSign;
5305     int aExp;
5306     uint32_t aSig;
5307     float64 r, x, xn;
5308     int i;
5309     a = float32_squash_input_denormal(a, status);
5310 
5311     aSig = extractFloat32Frac( a );
5312     aExp = extractFloat32Exp( a );
5313     aSign = extractFloat32Sign( a );
5314 
5315     if ( aExp == 0xFF) {
5316         if (aSig) {
5317             return propagateFloat32NaN(a, float32_zero, status);
5318         }
5319         return (aSign) ? float32_zero : a;
5320     }
5321     if (aExp == 0) {
5322         if (aSig == 0) return float32_one;
5323     }
5324 
5325     float_raise(float_flag_inexact, status);
5326 
5327     /* ******************************* */
5328     /* using float64 for approximation */
5329     /* ******************************* */
5330     x = float32_to_float64(a, status);
5331     x = float64_mul(x, float64_ln2, status);
5332 
5333     xn = x;
5334     r = float64_one;
5335     for (i = 0 ; i < 15 ; i++) {
5336         float64 f;
5337 
5338         f = float64_mul(xn, float32_exp2_coefficients[i], status);
5339         r = float64_add(r, f, status);
5340 
5341         xn = float64_mul(xn, x, status);
5342     }
5343 
5344     return float64_to_float32(r, status);
5345 }
5346 
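/*----------------------------------------------------------------------------
| Illustrative sketch, not part of the original sources: the same truncated
| series evaluated with host doubles.  exp2_series_ref() is a hypothetical
| helper that could serve as a rough cross-check of float32_exp2 in a test
| harness; it is not called anywhere in this file.
*----------------------------------------------------------------------------*/

static inline double exp2_series_ref(double x)
{
    /* 2^x = e^(x * ln 2); sum the first 15 terms of the series, as the loop
     * above does with its table of 1/n! coefficients. */
    double t = x * 0.6931471805599453;  /* x * ln(2) */
    double term = t;                    /* holds t^n / n! */
    double sum = 1.0;
    int n;

    for (n = 1; n <= 15; n++) {
        sum += term;
        term = term * t / (n + 1);
    }
    return sum;
}
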
5347 /*----------------------------------------------------------------------------
5348 | Returns the binary log of the single-precision floating-point value `a'.
5349 | The operation is performed according to the IEC/IEEE Standard for Binary
5350 | Floating-Point Arithmetic.
5351 *----------------------------------------------------------------------------*/
5352 float32 float32_log2(float32 a, float_status *status)
5353 {
5354     bool aSign, zSign;
5355     int aExp;
5356     uint32_t aSig, zSig, i;
5357 
5358     a = float32_squash_input_denormal(a, status);
5359     aSig = extractFloat32Frac( a );
5360     aExp = extractFloat32Exp( a );
5361     aSign = extractFloat32Sign( a );
5362 
5363     if ( aExp == 0 ) {
5364         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
5365         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
5366     }
5367     if ( aSign ) {
5368         float_raise(float_flag_invalid, status);
5369         return float32_default_nan(status);
5370     }
5371     if ( aExp == 0xFF ) {
5372         if (aSig) {
5373             return propagateFloat32NaN(a, float32_zero, status);
5374         }
5375         return a;
5376     }
5377 
5378     aExp -= 0x7F;
5379     aSig |= 0x00800000;
5380     zSign = aExp < 0;
5381     zSig = aExp << 23;
5382 
5383     for (i = 1 << 22; i > 0; i >>= 1) {
5384         aSig = ( (uint64_t)aSig * aSig ) >> 23;
5385         if ( aSig & 0x01000000 ) {
5386             aSig >>= 1;
5387             zSig |= i;
5388         }
5389     }
5390 
5391     if ( zSign )
5392         zSig = -zSig;
5393 
5394     return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
5395 }
5396 
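/*----------------------------------------------------------------------------
| Illustrative sketch, not part of the original sources: the shift-and-square
| algorithm used above, restated with host doubles.  Each squaring doubles the
| remaining logarithm; whenever the squared value reaches [2, 4) the next
| fraction bit of log2 is 1 and the value is halved back into [1, 2).
| log2_frac_ref() is a hypothetical helper and is not called in this file.
*----------------------------------------------------------------------------*/

static inline double log2_frac_ref(double m)
{
    /* m must be in [1, 2); returns the fractional part of log2(m) computed
     * bit by bit, mirroring the 23-iteration fixed-point loop above. */
    double frac = 0.0;
    double bit = 0.5;
    int i;

    for (i = 0; i < 23; i++) {
        m = m * m;
        if (m >= 2.0) {
            m *= 0.5;
            frac += bit;
        }
        bit *= 0.5;
    }
    return frac;
}
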
5397 /*----------------------------------------------------------------------------
5398 | Returns the result of converting the double-precision floating-point value
5399 | `a' to the extended double-precision floating-point format.  The conversion
5400 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5401 | Arithmetic.
5402 *----------------------------------------------------------------------------*/
5403 
5404 floatx80 float64_to_floatx80(float64 a, float_status *status)
5405 {
5406     bool aSign;
5407     int aExp;
5408     uint64_t aSig;
5409 
5410     a = float64_squash_input_denormal(a, status);
5411     aSig = extractFloat64Frac( a );
5412     aExp = extractFloat64Exp( a );
5413     aSign = extractFloat64Sign( a );
5414     if ( aExp == 0x7FF ) {
5415         if (aSig) {
5416             floatx80 res = commonNaNToFloatx80(float64ToCommonNaN(a, status),
5417                                                status);
5418             return floatx80_silence_nan(res, status);
5419         }
5420         return packFloatx80(aSign,
5421                             floatx80_infinity_high,
5422                             floatx80_infinity_low);
5423     }
5424     if ( aExp == 0 ) {
5425         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
5426         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5427     }
5428     return
5429         packFloatx80(
5430             aSign, aExp + 0x3C00, (aSig | UINT64_C(0x0010000000000000)) << 11);
5431 
5432 }
5433 
5434 /*----------------------------------------------------------------------------
5435 | Returns the result of converting the double-precision floating-point value
5436 | `a' to the quadruple-precision floating-point format.  The conversion is
5437 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5438 | Arithmetic.
5439 *----------------------------------------------------------------------------*/
5440 
5441 float128 float64_to_float128(float64 a, float_status *status)
5442 {
5443     bool aSign;
5444     int aExp;
5445     uint64_t aSig, zSig0, zSig1;
5446 
5447     a = float64_squash_input_denormal(a, status);
5448     aSig = extractFloat64Frac( a );
5449     aExp = extractFloat64Exp( a );
5450     aSign = extractFloat64Sign( a );
5451     if ( aExp == 0x7FF ) {
5452         if (aSig) {
5453             return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
5454         }
5455         return packFloat128( aSign, 0x7FFF, 0, 0 );
5456     }
5457     if ( aExp == 0 ) {
5458         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
5459         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5460         --aExp;
5461     }
5462     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
5463     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
5464 
5465 }
5466 
5467 
5468 /*----------------------------------------------------------------------------
5469 | Returns the remainder of the double-precision floating-point value `a'
5470 | with respect to the corresponding value `b'.  The operation is performed
5471 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5472 *----------------------------------------------------------------------------*/
5473 
5474 float64 float64_rem(float64 a, float64 b, float_status *status)
5475 {
5476     bool aSign, zSign;
5477     int aExp, bExp, expDiff;
5478     uint64_t aSig, bSig;
5479     uint64_t q, alternateASig;
5480     int64_t sigMean;
5481 
5482     a = float64_squash_input_denormal(a, status);
5483     b = float64_squash_input_denormal(b, status);
5484     aSig = extractFloat64Frac( a );
5485     aExp = extractFloat64Exp( a );
5486     aSign = extractFloat64Sign( a );
5487     bSig = extractFloat64Frac( b );
5488     bExp = extractFloat64Exp( b );
5489     if ( aExp == 0x7FF ) {
5490         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
5491             return propagateFloat64NaN(a, b, status);
5492         }
5493         float_raise(float_flag_invalid, status);
5494         return float64_default_nan(status);
5495     }
5496     if ( bExp == 0x7FF ) {
5497         if (bSig) {
5498             return propagateFloat64NaN(a, b, status);
5499         }
5500         return a;
5501     }
5502     if ( bExp == 0 ) {
5503         if ( bSig == 0 ) {
5504             float_raise(float_flag_invalid, status);
5505             return float64_default_nan(status);
5506         }
5507         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
5508     }
5509     if ( aExp == 0 ) {
5510         if ( aSig == 0 ) return a;
5511         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5512     }
5513     expDiff = aExp - bExp;
5514     aSig = (aSig | UINT64_C(0x0010000000000000)) << 11;
5515     bSig = (bSig | UINT64_C(0x0010000000000000)) << 11;
5516     if ( expDiff < 0 ) {
5517         if ( expDiff < -1 ) return a;
5518         aSig >>= 1;
5519     }
5520     q = ( bSig <= aSig );
5521     if ( q ) aSig -= bSig;
5522     expDiff -= 64;
5523     while ( 0 < expDiff ) {
5524         q = estimateDiv128To64( aSig, 0, bSig );
5525         q = ( 2 < q ) ? q - 2 : 0;
5526         aSig = - ( ( bSig>>2 ) * q );
5527         expDiff -= 62;
5528     }
5529     expDiff += 64;
5530     if ( 0 < expDiff ) {
5531         q = estimateDiv128To64( aSig, 0, bSig );
5532         q = ( 2 < q ) ? q - 2 : 0;
5533         q >>= 64 - expDiff;
5534         bSig >>= 2;
5535         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
5536     }
5537     else {
5538         aSig >>= 2;
5539         bSig >>= 2;
5540     }
5541     do {
5542         alternateASig = aSig;
5543         ++q;
5544         aSig -= bSig;
5545     } while ( 0 <= (int64_t) aSig );
5546     sigMean = aSig + alternateASig;
5547     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
5548         aSig = alternateASig;
5549     }
5550     zSign = ( (int64_t) aSig < 0 );
5551     if ( zSign ) aSig = - aSig;
5552     return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
5553 
5554 }
5555 
5556 /*----------------------------------------------------------------------------
5557 | Returns the binary log of the double-precision floating-point value `a'.
5558 | The operation is performed according to the IEC/IEEE Standard for Binary
5559 | Floating-Point Arithmetic.
5560 *----------------------------------------------------------------------------*/
5561 float64 float64_log2(float64 a, float_status *status)
5562 {
5563     bool aSign, zSign;
5564     int aExp;
5565     uint64_t aSig, aSig0, aSig1, zSig, i;
5566     a = float64_squash_input_denormal(a, status);
5567 
5568     aSig = extractFloat64Frac( a );
5569     aExp = extractFloat64Exp( a );
5570     aSign = extractFloat64Sign( a );
5571 
5572     if ( aExp == 0 ) {
5573         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
5574         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5575     }
5576     if ( aSign ) {
5577         float_raise(float_flag_invalid, status);
5578         return float64_default_nan(status);
5579     }
5580     if ( aExp == 0x7FF ) {
5581         if (aSig) {
5582             return propagateFloat64NaN(a, float64_zero, status);
5583         }
5584         return a;
5585     }
5586 
5587     aExp -= 0x3FF;
5588     aSig |= UINT64_C(0x0010000000000000);
5589     zSign = aExp < 0;
5590     zSig = (uint64_t)aExp << 52;
5591     for (i = 1LL << 51; i > 0; i >>= 1) {
5592         mul64To128( aSig, aSig, &aSig0, &aSig1 );
5593         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
5594         if ( aSig & UINT64_C(0x0020000000000000) ) {
5595             aSig >>= 1;
5596             zSig |= i;
5597         }
5598     }
5599 
5600     if ( zSign )
5601         zSig = -zSig;
5602     return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
5603 }
5604 
5605 /*----------------------------------------------------------------------------
5606 | Returns the result of converting the extended double-precision floating-
5607 | point value `a' to the 32-bit two's complement integer format.  The
5608 | conversion is performed according to the IEC/IEEE Standard for Binary
5609 | Floating-Point Arithmetic---which means in particular that the conversion
5610 | is rounded according to the current rounding mode.  If `a' is a NaN, the
5611 | largest positive integer is returned.  Otherwise, if the conversion
5612 | overflows, the largest integer with the same sign as `a' is returned.
5613 *----------------------------------------------------------------------------*/
5614 
5615 int32_t floatx80_to_int32(floatx80 a, float_status *status)
5616 {
5617     bool aSign;
5618     int32_t aExp, shiftCount;
5619     uint64_t aSig;
5620 
5621     if (floatx80_invalid_encoding(a)) {
5622         float_raise(float_flag_invalid, status);
5623         return 1 << 31;
5624     }
5625     aSig = extractFloatx80Frac( a );
5626     aExp = extractFloatx80Exp( a );
5627     aSign = extractFloatx80Sign( a );
5628     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
5629     shiftCount = 0x4037 - aExp;
5630     if ( shiftCount <= 0 ) shiftCount = 1;
5631     shift64RightJamming( aSig, shiftCount, &aSig );
5632     return roundAndPackInt32(aSign, aSig, status);
5633 
5634 }
5635 
5636 /*----------------------------------------------------------------------------
5637 | Returns the result of converting the extended double-precision floating-
5638 | point value `a' to the 32-bit two's complement integer format.  The
5639 | conversion is performed according to the IEC/IEEE Standard for Binary
5640 | Floating-Point Arithmetic, except that the conversion is always rounded
5641 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
5642 | Otherwise, if the conversion overflows, the largest integer with the same
5643 | sign as `a' is returned.
5644 *----------------------------------------------------------------------------*/
5645 
5646 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
5647 {
5648     bool aSign;
5649     int32_t aExp, shiftCount;
5650     uint64_t aSig, savedASig;
5651     int32_t z;
5652 
5653     if (floatx80_invalid_encoding(a)) {
5654         float_raise(float_flag_invalid, status);
5655         return 1 << 31;
5656     }
5657     aSig = extractFloatx80Frac( a );
5658     aExp = extractFloatx80Exp( a );
5659     aSign = extractFloatx80Sign( a );
5660     if ( 0x401E < aExp ) {
5661         if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
5662         goto invalid;
5663     }
5664     else if ( aExp < 0x3FFF ) {
5665         if (aExp || aSig) {
5666             float_raise(float_flag_inexact, status);
5667         }
5668         return 0;
5669     }
5670     shiftCount = 0x403E - aExp;
5671     savedASig = aSig;
5672     aSig >>= shiftCount;
5673     z = aSig;
5674     if ( aSign ) z = - z;
5675     if ( ( z < 0 ) ^ aSign ) {
5676  invalid:
5677         float_raise(float_flag_invalid, status);
5678         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5679     }
5680     if ( ( aSig<<shiftCount ) != savedASig ) {
5681         float_raise(float_flag_inexact, status);
5682     }
5683     return z;
5684 
5685 }
5686 
5687 /*----------------------------------------------------------------------------
5688 | Returns the result of converting the extended double-precision floating-
5689 | point value `a' to the 64-bit two's complement integer format.  The
5690 | conversion is performed according to the IEC/IEEE Standard for Binary
5691 | Floating-Point Arithmetic---which means in particular that the conversion
5692 | is rounded according to the current rounding mode.  If `a' is a NaN,
5693 | the largest positive integer is returned.  Otherwise, if the conversion
5694 | overflows, the largest integer with the same sign as `a' is returned.
5695 *----------------------------------------------------------------------------*/
5696 
5697 int64_t floatx80_to_int64(floatx80 a, float_status *status)
5698 {
5699     bool aSign;
5700     int32_t aExp, shiftCount;
5701     uint64_t aSig, aSigExtra;
5702 
5703     if (floatx80_invalid_encoding(a)) {
5704         float_raise(float_flag_invalid, status);
5705         return 1ULL << 63;
5706     }
5707     aSig = extractFloatx80Frac( a );
5708     aExp = extractFloatx80Exp( a );
5709     aSign = extractFloatx80Sign( a );
5710     shiftCount = 0x403E - aExp;
5711     if ( shiftCount <= 0 ) {
5712         if ( shiftCount ) {
5713             float_raise(float_flag_invalid, status);
5714             if (!aSign || floatx80_is_any_nan(a)) {
5715                 return INT64_MAX;
5716             }
5717             return INT64_MIN;
5718         }
5719         aSigExtra = 0;
5720     }
5721     else {
5722         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
5723     }
5724     return roundAndPackInt64(aSign, aSig, aSigExtra, status);
5725 
5726 }
5727 
5728 /*----------------------------------------------------------------------------
5729 | Returns the result of converting the extended double-precision floating-
5730 | point value `a' to the 64-bit two's complement integer format.  The
5731 | conversion is performed according to the IEC/IEEE Standard for Binary
5732 | Floating-Point Arithmetic, except that the conversion is always rounded
5733 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
5734 | Otherwise, if the conversion overflows, the largest integer with the same
5735 | sign as `a' is returned.
5736 *----------------------------------------------------------------------------*/
5737 
5738 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
5739 {
5740     bool aSign;
5741     int32_t aExp, shiftCount;
5742     uint64_t aSig;
5743     int64_t z;
5744 
5745     if (floatx80_invalid_encoding(a)) {
5746         float_raise(float_flag_invalid, status);
5747         return 1ULL << 63;
5748     }
5749     aSig = extractFloatx80Frac( a );
5750     aExp = extractFloatx80Exp( a );
5751     aSign = extractFloatx80Sign( a );
5752     shiftCount = aExp - 0x403E;
5753     if ( 0 <= shiftCount ) {
5754         aSig &= UINT64_C(0x7FFFFFFFFFFFFFFF);
5755         if ( ( a.high != 0xC03E ) || aSig ) {
5756             float_raise(float_flag_invalid, status);
5757             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
5758                 return INT64_MAX;
5759             }
5760         }
5761         return INT64_MIN;
5762     }
5763     else if ( aExp < 0x3FFF ) {
5764         if (aExp | aSig) {
5765             float_raise(float_flag_inexact, status);
5766         }
5767         return 0;
5768     }
5769     z = aSig>>( - shiftCount );
5770     if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
5771         float_raise(float_flag_inexact, status);
5772     }
5773     if ( aSign ) z = - z;
5774     return z;
5775 
5776 }
5777 
5778 /*----------------------------------------------------------------------------
5779 | Returns the result of converting the extended double-precision floating-
5780 | point value `a' to the single-precision floating-point format.  The
5781 | conversion is performed according to the IEC/IEEE Standard for Binary
5782 | Floating-Point Arithmetic.
5783 *----------------------------------------------------------------------------*/
5784 
5785 float32 floatx80_to_float32(floatx80 a, float_status *status)
5786 {
5787     bool aSign;
5788     int32_t aExp;
5789     uint64_t aSig;
5790 
5791     if (floatx80_invalid_encoding(a)) {
5792         float_raise(float_flag_invalid, status);
5793         return float32_default_nan(status);
5794     }
5795     aSig = extractFloatx80Frac( a );
5796     aExp = extractFloatx80Exp( a );
5797     aSign = extractFloatx80Sign( a );
5798     if ( aExp == 0x7FFF ) {
5799         if ( (uint64_t) ( aSig<<1 ) ) {
5800             float32 res = commonNaNToFloat32(floatx80ToCommonNaN(a, status),
5801                                              status);
5802             return float32_silence_nan(res, status);
5803         }
5804         return packFloat32( aSign, 0xFF, 0 );
5805     }
5806     shift64RightJamming( aSig, 33, &aSig );
5807     if ( aExp || aSig ) aExp -= 0x3F81;
5808     return roundAndPackFloat32(aSign, aExp, aSig, status);
5809 
5810 }
5811 
5812 /*----------------------------------------------------------------------------
5813 | Returns the result of converting the extended double-precision floating-
5814 | point value `a' to the double-precision floating-point format.  The
5815 | conversion is performed according to the IEC/IEEE Standard for Binary
5816 | Floating-Point Arithmetic.
5817 *----------------------------------------------------------------------------*/
5818 
5819 float64 floatx80_to_float64(floatx80 a, float_status *status)
5820 {
5821     bool aSign;
5822     int32_t aExp;
5823     uint64_t aSig, zSig;
5824 
5825     if (floatx80_invalid_encoding(a)) {
5826         float_raise(float_flag_invalid, status);
5827         return float64_default_nan(status);
5828     }
5829     aSig = extractFloatx80Frac( a );
5830     aExp = extractFloatx80Exp( a );
5831     aSign = extractFloatx80Sign( a );
5832     if ( aExp == 0x7FFF ) {
5833         if ( (uint64_t) ( aSig<<1 ) ) {
5834             float64 res = commonNaNToFloat64(floatx80ToCommonNaN(a, status),
5835                                              status);
5836             return float64_silence_nan(res, status);
5837         }
5838         return packFloat64( aSign, 0x7FF, 0 );
5839     }
5840     shift64RightJamming( aSig, 1, &zSig );
5841     if ( aExp || aSig ) aExp -= 0x3C01;
5842     return roundAndPackFloat64(aSign, aExp, zSig, status);
5843 
5844 }
5845 
5846 /*----------------------------------------------------------------------------
5847 | Returns the result of converting the extended double-precision floating-
5848 | point value `a' to the quadruple-precision floating-point format.  The
5849 | conversion is performed according to the IEC/IEEE Standard for Binary
5850 | Floating-Point Arithmetic.
5851 *----------------------------------------------------------------------------*/
5852 
5853 float128 floatx80_to_float128(floatx80 a, float_status *status)
5854 {
5855     bool aSign;
5856     int aExp;
5857     uint64_t aSig, zSig0, zSig1;
5858 
5859     if (floatx80_invalid_encoding(a)) {
5860         float_raise(float_flag_invalid, status);
5861         return float128_default_nan(status);
5862     }
5863     aSig = extractFloatx80Frac( a );
5864     aExp = extractFloatx80Exp( a );
5865     aSign = extractFloatx80Sign( a );
5866     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
5867         float128 res = commonNaNToFloat128(floatx80ToCommonNaN(a, status),
5868                                            status);
5869         return float128_silence_nan(res, status);
5870     }
5871     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
5872     return packFloat128( aSign, aExp, zSig0, zSig1 );
5873 
5874 }
5875 
5876 /*----------------------------------------------------------------------------
5877 | Rounds the extended double-precision floating-point value `a'
5878 | to the precision provided by floatx80_rounding_precision and returns the
5879 | result as an extended double-precision floating-point value.
5880 | The operation is performed according to the IEC/IEEE Standard for Binary
5881 | Floating-Point Arithmetic.
5882 *----------------------------------------------------------------------------*/
5883 
5884 floatx80 floatx80_round(floatx80 a, float_status *status)
5885 {
5886     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5887                                 extractFloatx80Sign(a),
5888                                 extractFloatx80Exp(a),
5889                                 extractFloatx80Frac(a), 0, status);
5890 }
5891 
5892 /*----------------------------------------------------------------------------
5893 | Rounds the extended double-precision floating-point value `a' to an integer,
5894 | and returns the result as an extended double-precision floating-point
5895 | value.  The operation is performed according to the IEC/IEEE Standard for
5896 | Binary Floating-Point Arithmetic.
5897 *----------------------------------------------------------------------------*/
5898 
5899 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
5900 {
5901     bool aSign;
5902     int32_t aExp;
5903     uint64_t lastBitMask, roundBitsMask;
5904     floatx80 z;
5905 
5906     if (floatx80_invalid_encoding(a)) {
5907         float_raise(float_flag_invalid, status);
5908         return floatx80_default_nan(status);
5909     }
5910     aExp = extractFloatx80Exp( a );
5911     if ( 0x403E <= aExp ) {
5912         if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
5913             return propagateFloatx80NaN(a, a, status);
5914         }
5915         return a;
5916     }
5917     if ( aExp < 0x3FFF ) {
5918         if (    ( aExp == 0 )
5919              && ( (uint64_t) ( extractFloatx80Frac( a ) ) == 0 ) ) {
5920             return a;
5921         }
5922         float_raise(float_flag_inexact, status);
5923         aSign = extractFloatx80Sign( a );
5924         switch (status->float_rounding_mode) {
5925          case float_round_nearest_even:
5926             if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
5927                ) {
5928                 return
5929                     packFloatx80( aSign, 0x3FFF, UINT64_C(0x8000000000000000));
5930             }
5931             break;
5932         case float_round_ties_away:
5933             if (aExp == 0x3FFE) {
5934                 return packFloatx80(aSign, 0x3FFF, UINT64_C(0x8000000000000000));
5935             }
5936             break;
5937          case float_round_down:
5938             return
5939                   aSign ?
5940                       packFloatx80( 1, 0x3FFF, UINT64_C(0x8000000000000000))
5941                 : packFloatx80( 0, 0, 0 );
5942          case float_round_up:
5943             return
5944                   aSign ? packFloatx80( 1, 0, 0 )
5945                 : packFloatx80( 0, 0x3FFF, UINT64_C(0x8000000000000000));
5946 
5947         case float_round_to_zero:
5948             break;
5949         default:
5950             g_assert_not_reached();
5951         }
5952         return packFloatx80( aSign, 0, 0 );
5953     }
5954     lastBitMask = 1;
5955     lastBitMask <<= 0x403E - aExp;
5956     roundBitsMask = lastBitMask - 1;
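    /* Illustrative example: for aExp == 0x4001, i.e. |a| in [4, 8), the units
     * bit of the value sits at bit 0x403E - 0x4001 == 61 of the significand,
     * so lastBitMask is 1 << 61 and roundBitsMask covers the 61 fraction bits
     * below it. */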
5957     z = a;
5958     switch (status->float_rounding_mode) {
5959     case float_round_nearest_even:
5960         z.low += lastBitMask>>1;
5961         if ((z.low & roundBitsMask) == 0) {
5962             z.low &= ~lastBitMask;
5963         }
5964         break;
5965     case float_round_ties_away:
5966         z.low += lastBitMask >> 1;
5967         break;
5968     case float_round_to_zero:
5969         break;
5970     case float_round_up:
5971         if (!extractFloatx80Sign(z)) {
5972             z.low += roundBitsMask;
5973         }
5974         break;
5975     case float_round_down:
5976         if (extractFloatx80Sign(z)) {
5977             z.low += roundBitsMask;
5978         }
5979         break;
5980     default:
5981         abort();
5982     }
5983     z.low &= ~ roundBitsMask;
5984     if ( z.low == 0 ) {
5985         ++z.high;
5986         z.low = UINT64_C(0x8000000000000000);
5987     }
5988     if (z.low != a.low) {
5989         float_raise(float_flag_inexact, status);
5990     }
5991     return z;
5992 
5993 }
5994 
5995 /*----------------------------------------------------------------------------
5996 | Returns the result of adding the absolute values of the extended double-
5997 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
5998 | negated before being returned.  `zSign' is ignored if the result is a NaN.
5999 | The addition is performed according to the IEC/IEEE Standard for Binary
6000 | Floating-Point Arithmetic.
6001 *----------------------------------------------------------------------------*/
6002 
6003 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, bool zSign,
6004                                 float_status *status)
6005 {
6006     int32_t aExp, bExp, zExp;
6007     uint64_t aSig, bSig, zSig0, zSig1;
6008     int32_t expDiff;
6009 
6010     aSig = extractFloatx80Frac( a );
6011     aExp = extractFloatx80Exp( a );
6012     bSig = extractFloatx80Frac( b );
6013     bExp = extractFloatx80Exp( b );
6014     expDiff = aExp - bExp;
6015     if ( 0 < expDiff ) {
6016         if ( aExp == 0x7FFF ) {
6017             if ((uint64_t)(aSig << 1)) {
6018                 return propagateFloatx80NaN(a, b, status);
6019             }
6020             return a;
6021         }
6022         if ( bExp == 0 ) --expDiff;
6023         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
6024         zExp = aExp;
6025     }
6026     else if ( expDiff < 0 ) {
6027         if ( bExp == 0x7FFF ) {
6028             if ((uint64_t)(bSig << 1)) {
6029                 return propagateFloatx80NaN(a, b, status);
6030             }
6031             return packFloatx80(zSign,
6032                                 floatx80_infinity_high,
6033                                 floatx80_infinity_low);
6034         }
6035         if ( aExp == 0 ) ++expDiff;
6036         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
6037         zExp = bExp;
6038     }
6039     else {
6040         if ( aExp == 0x7FFF ) {
6041             if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
6042                 return propagateFloatx80NaN(a, b, status);
6043             }
6044             return a;
6045         }
6046         zSig1 = 0;
6047         zSig0 = aSig + bSig;
6048         if ( aExp == 0 ) {
6049             if ((aSig | bSig) & UINT64_C(0x8000000000000000) && zSig0 < aSig) {
6050                 /* At least one of the values is a pseudo-denormal,
6051                  * and there is a carry out of the result.  */
6052                 zExp = 1;
6053                 goto shiftRight1;
6054             }
6055             if (zSig0 == 0) {
6056                 return packFloatx80(zSign, 0, 0);
6057             }
6058             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
6059             goto roundAndPack;
6060         }
6061         zExp = aExp;
6062         goto shiftRight1;
6063     }
6064     zSig0 = aSig + bSig;
6065     if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
6066  shiftRight1:
6067     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
6068     zSig0 |= UINT64_C(0x8000000000000000);
6069     ++zExp;
6070  roundAndPack:
6071     return roundAndPackFloatx80(status->floatx80_rounding_precision,
6072                                 zSign, zExp, zSig0, zSig1, status);
6073 }
6074 
6075 /*----------------------------------------------------------------------------
6076 | Returns the result of subtracting the absolute values of the extended
6077 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
6078 | difference is negated before being returned.  `zSign' is ignored if the
6079 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
6080 | Standard for Binary Floating-Point Arithmetic.
6081 *----------------------------------------------------------------------------*/
6082 
6083 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, bool zSign,
6084                                 float_status *status)
6085 {
6086     int32_t aExp, bExp, zExp;
6087     uint64_t aSig, bSig, zSig0, zSig1;
6088     int32_t expDiff;
6089 
6090     aSig = extractFloatx80Frac( a );
6091     aExp = extractFloatx80Exp( a );
6092     bSig = extractFloatx80Frac( b );
6093     bExp = extractFloatx80Exp( b );
6094     expDiff = aExp - bExp;
6095     if ( 0 < expDiff ) goto aExpBigger;
6096     if ( expDiff < 0 ) goto bExpBigger;
6097     if ( aExp == 0x7FFF ) {
6098         if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
6099             return propagateFloatx80NaN(a, b, status);
6100         }
6101         float_raise(float_flag_invalid, status);
6102         return floatx80_default_nan(status);
6103     }
6104     if ( aExp == 0 ) {
6105         aExp = 1;
6106         bExp = 1;
6107     }
6108     zSig1 = 0;
6109     if ( bSig < aSig ) goto aBigger;
6110     if ( aSig < bSig ) goto bBigger;
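    /* Exact cancellation: per the IEC/IEEE standard the zero result is -0
     * only when rounding toward negative infinity, and +0 otherwise. */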
6111     return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
6112  bExpBigger:
6113     if ( bExp == 0x7FFF ) {
6114         if ((uint64_t)(bSig << 1)) {
6115             return propagateFloatx80NaN(a, b, status);
6116         }
6117         return packFloatx80(zSign ^ 1, floatx80_infinity_high,
6118                             floatx80_infinity_low);
6119     }
6120     if ( aExp == 0 ) ++expDiff;
6121     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
6122  bBigger:
6123     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
6124     zExp = bExp;
6125     zSign ^= 1;
6126     goto normalizeRoundAndPack;
6127  aExpBigger:
6128     if ( aExp == 0x7FFF ) {
6129         if ((uint64_t)(aSig << 1)) {
6130             return propagateFloatx80NaN(a, b, status);
6131         }
6132         return a;
6133     }
6134     if ( bExp == 0 ) --expDiff;
6135     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
6136  aBigger:
6137     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
6138     zExp = aExp;
6139  normalizeRoundAndPack:
6140     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
6141                                          zSign, zExp, zSig0, zSig1, status);
6142 }
6143 
6144 /*----------------------------------------------------------------------------
6145 | Returns the result of adding the extended double-precision floating-point
6146 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6147 | Standard for Binary Floating-Point Arithmetic.
6148 *----------------------------------------------------------------------------*/
6149 
6150 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
6151 {
6152     bool aSign, bSign;
6153 
6154     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6155         float_raise(float_flag_invalid, status);
6156         return floatx80_default_nan(status);
6157     }
6158     aSign = extractFloatx80Sign( a );
6159     bSign = extractFloatx80Sign( b );
6160     if ( aSign == bSign ) {
6161         return addFloatx80Sigs(a, b, aSign, status);
6162     }
6163     else {
6164         return subFloatx80Sigs(a, b, aSign, status);
6165     }
6166 
6167 }
6168 
6169 /*----------------------------------------------------------------------------
6170 | Returns the result of subtracting the extended double-precision floating-
6171 | point values `a' and `b'.  The operation is performed according to the
6172 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6173 *----------------------------------------------------------------------------*/
6174 
6175 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
6176 {
6177     bool aSign, bSign;
6178 
6179     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6180         float_raise(float_flag_invalid, status);
6181         return floatx80_default_nan(status);
6182     }
6183     aSign = extractFloatx80Sign( a );
6184     bSign = extractFloatx80Sign( b );
6185     if ( aSign == bSign ) {
6186         return subFloatx80Sigs(a, b, aSign, status);
6187     }
6188     else {
6189         return addFloatx80Sigs(a, b, aSign, status);
6190     }
6191 
6192 }
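
/*----------------------------------------------------------------------------
| Editor's illustrative sketch (not part of the SoftFloat/QEMU sources): a
| minimal example of driving floatx80_add()/floatx80_sub() through an
| explicit float_status, assuming the usual softfloat helpers declared in
| softfloat.h (int32_to_floatx80, ...) are in scope.  The helper name below
| is hypothetical and the block is kept under "#if 0" so it is never
| compiled.
*----------------------------------------------------------------------------*/
#if 0
static floatx80 example_floatx80_add_sub(float_status *st)
{
    floatx80 seven = int32_to_floatx80(7, st);
    floatx80 three = int32_to_floatx80(3, st);

    /* Equal signs dispatch to addFloatx80Sigs(), opposite signs to       */
    /* subFloatx80Sigs(); both honour st->floatx80_rounding_precision.    */
    floatx80 sum  = floatx80_add(seven, three, st);   /* 10.0 */
    floatx80 diff = floatx80_sub(seven, three, st);   /*  4.0 */
    return floatx80_sub(sum, diff, st);               /*  6.0 */
}
#endif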
6193 
6194 /*----------------------------------------------------------------------------
6195 | Returns the result of multiplying the extended double-precision floating-
6196 | point values `a' and `b'.  The operation is performed according to the
6197 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6198 *----------------------------------------------------------------------------*/
6199 
6200 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
6201 {
6202     bool aSign, bSign, zSign;
6203     int32_t aExp, bExp, zExp;
6204     uint64_t aSig, bSig, zSig0, zSig1;
6205 
6206     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6207         float_raise(float_flag_invalid, status);
6208         return floatx80_default_nan(status);
6209     }
6210     aSig = extractFloatx80Frac( a );
6211     aExp = extractFloatx80Exp( a );
6212     aSign = extractFloatx80Sign( a );
6213     bSig = extractFloatx80Frac( b );
6214     bExp = extractFloatx80Exp( b );
6215     bSign = extractFloatx80Sign( b );
6216     zSign = aSign ^ bSign;
6217     if ( aExp == 0x7FFF ) {
6218         if (    (uint64_t) ( aSig<<1 )
6219              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
6220             return propagateFloatx80NaN(a, b, status);
6221         }
6222         if ( ( bExp | bSig ) == 0 ) goto invalid;
6223         return packFloatx80(zSign, floatx80_infinity_high,
6224                                    floatx80_infinity_low);
6225     }
6226     if ( bExp == 0x7FFF ) {
6227         if ((uint64_t)(bSig << 1)) {
6228             return propagateFloatx80NaN(a, b, status);
6229         }
6230         if ( ( aExp | aSig ) == 0 ) {
6231  invalid:
6232             float_raise(float_flag_invalid, status);
6233             return floatx80_default_nan(status);
6234         }
6235         return packFloatx80(zSign, floatx80_infinity_high,
6236                                    floatx80_infinity_low);
6237     }
6238     if ( aExp == 0 ) {
6239         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
6240         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
6241     }
6242     if ( bExp == 0 ) {
6243         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
6244         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
6245     }
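    /*
     * Interpreting each significand as sig * 2^(exp - 0x3FFF - 63), the
     * exact product is (aSig * bSig) * 2^(aExp + bExp - 2*0x3FFF - 126),
     * while the 128-bit result zSig0:zSig1 is packed as a value scaled by
     * 2^(zExp - 0x3FFF - 127).  Equating the two gives the 0x3FFE bias
     * below; the conditional shift renormalizes the case where the
     * product's leading bit lands at position 126.
     */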
6246     zExp = aExp + bExp - 0x3FFE;
6247     mul64To128( aSig, bSig, &zSig0, &zSig1 );
6248     if ( 0 < (int64_t) zSig0 ) {
6249         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
6250         --zExp;
6251     }
6252     return roundAndPackFloatx80(status->floatx80_rounding_precision,
6253                                 zSign, zExp, zSig0, zSig1, status);
6254 }
6255 
6256 /*----------------------------------------------------------------------------
6257 | Returns the result of dividing the extended double-precision floating-point
6258 | value `a' by the corresponding value `b'.  The operation is performed
6259 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6260 *----------------------------------------------------------------------------*/
6261 
6262 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
6263 {
6264     bool aSign, bSign, zSign;
6265     int32_t aExp, bExp, zExp;
6266     uint64_t aSig, bSig, zSig0, zSig1;
6267     uint64_t rem0, rem1, rem2, term0, term1, term2;
6268 
6269     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6270         float_raise(float_flag_invalid, status);
6271         return floatx80_default_nan(status);
6272     }
6273     aSig = extractFloatx80Frac( a );
6274     aExp = extractFloatx80Exp( a );
6275     aSign = extractFloatx80Sign( a );
6276     bSig = extractFloatx80Frac( b );
6277     bExp = extractFloatx80Exp( b );
6278     bSign = extractFloatx80Sign( b );
6279     zSign = aSign ^ bSign;
6280     if ( aExp == 0x7FFF ) {
6281         if ((uint64_t)(aSig << 1)) {
6282             return propagateFloatx80NaN(a, b, status);
6283         }
6284         if ( bExp == 0x7FFF ) {
6285             if ((uint64_t)(bSig << 1)) {
6286                 return propagateFloatx80NaN(a, b, status);
6287             }
6288             goto invalid;
6289         }
6290         return packFloatx80(zSign, floatx80_infinity_high,
6291                                    floatx80_infinity_low);
6292     }
6293     if ( bExp == 0x7FFF ) {
6294         if ((uint64_t)(bSig << 1)) {
6295             return propagateFloatx80NaN(a, b, status);
6296         }
6297         return packFloatx80( zSign, 0, 0 );
6298     }
6299     if ( bExp == 0 ) {
6300         if ( bSig == 0 ) {
6301             if ( ( aExp | aSig ) == 0 ) {
6302  invalid:
6303                 float_raise(float_flag_invalid, status);
6304                 return floatx80_default_nan(status);
6305             }
6306             float_raise(float_flag_divbyzero, status);
6307             return packFloatx80(zSign, floatx80_infinity_high,
6308                                        floatx80_infinity_low);
6309         }
6310         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
6311     }
6312     if ( aExp == 0 ) {
6313         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
6314         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
6315     }
6316     zExp = aExp - bExp + 0x3FFE;
6317     rem1 = 0;
6318     if ( bSig <= aSig ) {
6319         shift128Right( aSig, 0, 1, &aSig, &rem1 );
6320         ++zExp;
6321     }
6322     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
6323     mul64To128( bSig, zSig0, &term0, &term1 );
6324     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
6325     while ( (int64_t) rem0 < 0 ) {
6326         --zSig0;
6327         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
6328     }
6329     zSig1 = estimateDiv128To64( rem1, 0, bSig );
6330     if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
6331         mul64To128( bSig, zSig1, &term1, &term2 );
6332         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6333         while ( (int64_t) rem1 < 0 ) {
6334             --zSig1;
6335             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
6336         }
6337         zSig1 |= ( ( rem1 | rem2 ) != 0 );
6338     }
6339     return roundAndPackFloatx80(status->floatx80_rounding_precision,
6340                                 zSign, zExp, zSig0, zSig1, status);
6341 }
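
/*----------------------------------------------------------------------------
| Editor's illustrative sketch (not part of the SoftFloat/QEMU sources): how
| a caller might observe the divide-by-zero handling above.  Dividing a
| finite non-zero value by zero raises float_flag_divbyzero and returns a
| correctly signed infinity; 0/0 instead raises float_flag_invalid and
| returns the default NaN.  The helper name is hypothetical and the block is
| never compiled.
*----------------------------------------------------------------------------*/
#if 0
static void example_floatx80_div_by_zero(float_status *st)
{
    floatx80 one  = int32_to_floatx80(1, st);
    floatx80 zero = int32_to_floatx80(0, st);

    set_float_exception_flags(0, st);
    floatx80 inf = floatx80_div(one, zero, st);    /* +inf, divbyzero raised */
    bool dbz = get_float_exception_flags(st) & float_flag_divbyzero;

    set_float_exception_flags(0, st);
    floatx80 nan = floatx80_div(zero, zero, st);   /* default NaN, invalid   */
    bool inv = get_float_exception_flags(st) & float_flag_invalid;

    (void)inf; (void)nan; (void)dbz; (void)inv;
}
#endif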
6342 
6343 /*----------------------------------------------------------------------------
6344 | Returns the remainder of the extended double-precision floating-point value
6345 | `a' with respect to the corresponding value `b'.  If 'mod' is false, the
6346 | operation is performed according to the IEC/IEEE Standard for Binary
6347 | Floating-Point Arithmetic; if 'mod' is true, the remainder is instead
6348 | computed with the quotient truncated toward zero.  '*quotient' is set to
6349 | the low 64 bits of the absolute value of the integer quotient.
6350 *----------------------------------------------------------------------------*/
6351 
6352 floatx80 floatx80_modrem(floatx80 a, floatx80 b, bool mod, uint64_t *quotient,
6353                          float_status *status)
6354 {
6355     bool aSign, zSign;
6356     int32_t aExp, bExp, expDiff, aExpOrig;
6357     uint64_t aSig0, aSig1, bSig;
6358     uint64_t q, term0, term1, alternateASig0, alternateASig1;
6359 
6360     *quotient = 0;
6361     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6362         float_raise(float_flag_invalid, status);
6363         return floatx80_default_nan(status);
6364     }
6365     aSig0 = extractFloatx80Frac( a );
6366     aExpOrig = aExp = extractFloatx80Exp( a );
6367     aSign = extractFloatx80Sign( a );
6368     bSig = extractFloatx80Frac( b );
6369     bExp = extractFloatx80Exp( b );
6370     if ( aExp == 0x7FFF ) {
6371         if (    (uint64_t) ( aSig0<<1 )
6372              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
6373             return propagateFloatx80NaN(a, b, status);
6374         }
6375         goto invalid;
6376     }
6377     if ( bExp == 0x7FFF ) {
6378         if ((uint64_t)(bSig << 1)) {
6379             return propagateFloatx80NaN(a, b, status);
6380         }
6381         if (aExp == 0 && aSig0 >> 63) {
6382             /*
6383              * Pseudo-denormal argument must be returned in normalized
6384              * form.
6385              */
6386             return packFloatx80(aSign, 1, aSig0);
6387         }
6388         return a;
6389     }
6390     if ( bExp == 0 ) {
6391         if ( bSig == 0 ) {
6392  invalid:
6393             float_raise(float_flag_invalid, status);
6394             return floatx80_default_nan(status);
6395         }
6396         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
6397     }
6398     if ( aExp == 0 ) {
6399         if ( aSig0 == 0 ) return a;
6400         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
6401     }
6402     zSign = aSign;
6403     expDiff = aExp - bExp;
6404     aSig1 = 0;
6405     if ( expDiff < 0 ) {
6406         if ( mod || expDiff < -1 ) {
6407             if (aExp == 1 && aExpOrig == 0) {
6408                 /*
6409                  * Pseudo-denormal argument must be returned in
6410                  * normalized form.
6411                  */
6412                 return packFloatx80(aSign, aExp, aSig0);
6413             }
6414             return a;
6415         }
6416         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
6417         expDiff = 0;
6418     }
6419     *quotient = q = ( bSig <= aSig0 );
6420     if ( q ) aSig0 -= bSig;
6421     expDiff -= 64;
6422     while ( 0 < expDiff ) {
6423         q = estimateDiv128To64( aSig0, aSig1, bSig );
6424         q = ( 2 < q ) ? q - 2 : 0;
6425         mul64To128( bSig, q, &term0, &term1 );
6426         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6427         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
6428         expDiff -= 62;
6429         *quotient <<= 62;
6430         *quotient += q;
6431     }
6432     expDiff += 64;
6433     if ( 0 < expDiff ) {
6434         q = estimateDiv128To64( aSig0, aSig1, bSig );
6435         q = ( 2 < q ) ? q - 2 : 0;
6436         q >>= 64 - expDiff;
6437         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
6438         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6439         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
6440         while ( le128( term0, term1, aSig0, aSig1 ) ) {
6441             ++q;
6442             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6443         }
6444         if (expDiff < 64) {
6445             *quotient <<= expDiff;
6446         } else {
6447             *quotient = 0;
6448         }
6449         *quotient += q;
6450     }
6451     else {
6452         term1 = 0;
6453         term0 = bSig;
6454     }
6455     if (!mod) {
6456         sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
6457         if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
6458                 || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
6459                         && ( q & 1 ) )
6460             ) {
6461             aSig0 = alternateASig0;
6462             aSig1 = alternateASig1;
6463             zSign = ! zSign;
6464             ++*quotient;
6465         }
6466     }
6467     return
6468         normalizeRoundAndPackFloatx80(
6469             80, zSign, bExp + expDiff, aSig0, aSig1, status);
6470 
6471 }
6472 
6473 /*----------------------------------------------------------------------------
6474 | Returns the remainder of the extended double-precision floating-point value
6475 | `a' with respect to the corresponding value `b'.  The operation is performed
6476 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6477 *----------------------------------------------------------------------------*/
6478 
6479 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
6480 {
6481     uint64_t quotient;
6482     return floatx80_modrem(a, b, false, &quotient, status);
6483 }
6484 
6485 /*----------------------------------------------------------------------------
6486 | Returns the remainder of the extended double-precision floating-point value
6487 | `a' with respect to the corresponding value `b', with the quotient truncated
6488 | toward zero.
6489 *----------------------------------------------------------------------------*/
6490 
6491 floatx80 floatx80_mod(floatx80 a, floatx80 b, float_status *status)
6492 {
6493     uint64_t quotient;
6494     return floatx80_modrem(a, b, true, &quotient, status);
6495 }
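
/*----------------------------------------------------------------------------
| Editor's illustrative sketch (not part of the SoftFloat/QEMU sources):
| contrasting the two wrappers above.  floatx80_rem() uses the IEEE
| round-to-nearest quotient, so the remainder can be negative even for
| positive operands, while floatx80_mod() truncates the quotient toward zero
| and keeps the sign of `a'.  The helper name is hypothetical and the block
| is never compiled.
*----------------------------------------------------------------------------*/
#if 0
static void example_floatx80_rem_vs_mod(float_status *st)
{
    floatx80 a = int32_to_floatx80(7, st);
    floatx80 b = int32_to_floatx80(4, st);

    floatx80 r = floatx80_rem(a, b, st);   /* 7 rem 4 = -1 (quotient 2) */
    floatx80 m = floatx80_mod(a, b, st);   /* 7 mod 4 =  3 (quotient 1) */
    (void)r; (void)m;
}
#endif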
6496 
6497 /*----------------------------------------------------------------------------
6498 | Returns the square root of the extended double-precision floating-point
6499 | value `a'.  The operation is performed according to the IEC/IEEE Standard
6500 | for Binary Floating-Point Arithmetic.
6501 *----------------------------------------------------------------------------*/
6502 
6503 floatx80 floatx80_sqrt(floatx80 a, float_status *status)
6504 {
6505     bool aSign;
6506     int32_t aExp, zExp;
6507     uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
6508     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6509 
6510     if (floatx80_invalid_encoding(a)) {
6511         float_raise(float_flag_invalid, status);
6512         return floatx80_default_nan(status);
6513     }
6514     aSig0 = extractFloatx80Frac( a );
6515     aExp = extractFloatx80Exp( a );
6516     aSign = extractFloatx80Sign( a );
6517     if ( aExp == 0x7FFF ) {
6518         if ((uint64_t)(aSig0 << 1)) {
6519             return propagateFloatx80NaN(a, a, status);
6520         }
6521         if ( ! aSign ) return a;
6522         goto invalid;
6523     }
6524     if ( aSign ) {
6525         if ( ( aExp | aSig0 ) == 0 ) return a;
6526  invalid:
6527         float_raise(float_flag_invalid, status);
6528         return floatx80_default_nan(status);
6529     }
6530     if ( aExp == 0 ) {
6531         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
6532         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
6533     }
6534     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
6535     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
6536     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
6537     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6538     doubleZSig0 = zSig0<<1;
6539     mul64To128( zSig0, zSig0, &term0, &term1 );
6540     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6541     while ( (int64_t) rem0 < 0 ) {
6542         --zSig0;
6543         doubleZSig0 -= 2;
6544         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6545     }
6546     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6547     if ( ( zSig1 & UINT64_C(0x3FFFFFFFFFFFFFFF) ) <= 5 ) {
6548         if ( zSig1 == 0 ) zSig1 = 1;
6549         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6550         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6551         mul64To128( zSig1, zSig1, &term2, &term3 );
6552         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6553         while ( (int64_t) rem1 < 0 ) {
6554             --zSig1;
6555             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6556             term3 |= 1;
6557             term2 |= doubleZSig0;
6558             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6559         }
6560         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6561     }
6562     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
6563     zSig0 |= doubleZSig0;
6564     return roundAndPackFloatx80(status->floatx80_rounding_precision,
6565                                 0, zExp, zSig0, zSig1, status);
6566 }
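
/*----------------------------------------------------------------------------
| Editor's illustrative sketch (not part of the SoftFloat/QEMU sources): the
| invalid-operation path of floatx80_sqrt() above.  sqrt(-0) returns -0
| unchanged, while the square root of any other negative value raises
| float_flag_invalid and returns the default NaN.  floatx80_is_any_nan() is
| assumed to be available from softfloat.h; the helper name is hypothetical
| and the block is never compiled.
*----------------------------------------------------------------------------*/
#if 0
static bool example_floatx80_sqrt_negative(float_status *st)
{
    floatx80 minus_four = int32_to_floatx80(-4, st);

    set_float_exception_flags(0, st);
    floatx80 r = floatx80_sqrt(minus_four, st);    /* default NaN */
    return floatx80_is_any_nan(r)
        && (get_float_exception_flags(st) & float_flag_invalid);
}
#endif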
6567 
6568 /*----------------------------------------------------------------------------
6569 | Returns the result of converting the quadruple-precision floating-point
6570 | value `a' to the 32-bit two's complement integer format.  The conversion
6571 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6572 | Arithmetic---which means in particular that the conversion is rounded
6573 | according to the current rounding mode.  If `a' is a NaN, the largest
6574 | positive integer is returned.  Otherwise, if the conversion overflows, the
6575 | largest integer with the same sign as `a' is returned.
6576 *----------------------------------------------------------------------------*/
6577 
6578 int32_t float128_to_int32(float128 a, float_status *status)
6579 {
6580     bool aSign;
6581     int32_t aExp, shiftCount;
6582     uint64_t aSig0, aSig1;
6583 
6584     aSig1 = extractFloat128Frac1( a );
6585     aSig0 = extractFloat128Frac0( a );
6586     aExp = extractFloat128Exp( a );
6587     aSign = extractFloat128Sign( a );
6588     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
6589     if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
6590     aSig0 |= ( aSig1 != 0 );
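    /*
     * roundAndPackInt32() expects its argument scaled so that the low 7
     * bits are rounding bits; the integer bit of aSig0 sits at bit 48, so
     * a value with exponent 0x3FFF needs a right shift of 48 - 7 = 41,
     * i.e. 0x4028 - aExp in general.
     */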
6591     shiftCount = 0x4028 - aExp;
6592     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
6593     return roundAndPackInt32(aSign, aSig0, status);
6594 
6595 }
6596 
6597 /*----------------------------------------------------------------------------
6598 | Returns the result of converting the quadruple-precision floating-point
6599 | value `a' to the 32-bit two's complement integer format.  The conversion
6600 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6601 | Arithmetic, except that the conversion is always rounded toward zero.  If
6602 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
6603 | conversion overflows, the largest integer with the same sign as `a' is
6604 | returned.
6605 *----------------------------------------------------------------------------*/
6606 
6607 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
6608 {
6609     bool aSign;
6610     int32_t aExp, shiftCount;
6611     uint64_t aSig0, aSig1, savedASig;
6612     int32_t z;
6613 
6614     aSig1 = extractFloat128Frac1( a );
6615     aSig0 = extractFloat128Frac0( a );
6616     aExp = extractFloat128Exp( a );
6617     aSign = extractFloat128Sign( a );
6618     aSig0 |= ( aSig1 != 0 );
6619     if ( 0x401E < aExp ) {
6620         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
6621         goto invalid;
6622     }
6623     else if ( aExp < 0x3FFF ) {
6624         if (aExp || aSig0) {
6625             float_raise(float_flag_inexact, status);
6626         }
6627         return 0;
6628     }
6629     aSig0 |= UINT64_C(0x0001000000000000);
6630     shiftCount = 0x402F - aExp;
6631     savedASig = aSig0;
6632     aSig0 >>= shiftCount;
6633     z = aSig0;
6634     if ( aSign ) z = - z;
6635     if ( ( z < 0 ) ^ aSign ) {
6636  invalid:
6637         float_raise(float_flag_invalid, status);
6638         return aSign ? INT32_MIN : INT32_MAX;
6639     }
6640     if ( ( aSig0<<shiftCount ) != savedASig ) {
6641         float_raise(float_flag_inexact, status);
6642     }
6643     return z;
6644 
6645 }
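
/*----------------------------------------------------------------------------
| Editor's illustrative sketch (not part of the SoftFloat/QEMU sources): the
| saturating behaviour of float128_to_int32_round_to_zero() above.  Values
| beyond the int32_t range (and NaNs) raise float_flag_invalid and saturate,
| while in-range values that merely lose bits only raise float_flag_inexact.
| The helper name is hypothetical and the block is never compiled.
*----------------------------------------------------------------------------*/
#if 0
static void example_float128_to_int32_rtz(float_status *st)
{
    float128 big = int64_to_float128(INT64_C(1) << 40, st);  /* 2^40      */
    float128 neg = int64_to_float128(-3, st);

    set_float_exception_flags(0, st);
    int32_t hi = float128_to_int32_round_to_zero(big, st);   /* INT32_MAX */
    int32_t lo = float128_to_int32_round_to_zero(neg, st);   /* -3        */
    (void)hi; (void)lo;
}
#endif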
6646 
6647 /*----------------------------------------------------------------------------
6648 | Returns the result of converting the quadruple-precision floating-point
6649 | value `a' to the 64-bit two's complement integer format.  The conversion
6650 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6651 | Arithmetic---which means in particular that the conversion is rounded
6652 | according to the current rounding mode.  If `a' is a NaN, the largest
6653 | positive integer is returned.  Otherwise, if the conversion overflows, the
6654 | largest integer with the same sign as `a' is returned.
6655 *----------------------------------------------------------------------------*/
6656 
6657 int64_t float128_to_int64(float128 a, float_status *status)
6658 {
6659     bool aSign;
6660     int32_t aExp, shiftCount;
6661     uint64_t aSig0, aSig1;
6662 
6663     aSig1 = extractFloat128Frac1( a );
6664     aSig0 = extractFloat128Frac0( a );
6665     aExp = extractFloat128Exp( a );
6666     aSign = extractFloat128Sign( a );
6667     if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
6668     shiftCount = 0x402F - aExp;
6669     if ( shiftCount <= 0 ) {
6670         if ( 0x403E < aExp ) {
6671             float_raise(float_flag_invalid, status);
6672             if (    ! aSign
6673                  || (    ( aExp == 0x7FFF )
6674                       && ( aSig1 || ( aSig0 != UINT64_C(0x0001000000000000) ) )
6675                     )
6676                ) {
6677                 return INT64_MAX;
6678             }
6679             return INT64_MIN;
6680         }
6681         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
6682     }
6683     else {
6684         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
6685     }
6686     return roundAndPackInt64(aSign, aSig0, aSig1, status);
6687 
6688 }
6689 
6690 /*----------------------------------------------------------------------------
6691 | Returns the result of converting the quadruple-precision floating-point
6692 | value `a' to the 64-bit two's complement integer format.  The conversion
6693 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6694 | Arithmetic, except that the conversion is always rounded toward zero.
6695 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
6696 | the conversion overflows, the largest integer with the same sign as `a' is
6697 | returned.
6698 *----------------------------------------------------------------------------*/
6699 
6700 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
6701 {
6702     bool aSign;
6703     int32_t aExp, shiftCount;
6704     uint64_t aSig0, aSig1;
6705     int64_t z;
6706 
6707     aSig1 = extractFloat128Frac1( a );
6708     aSig0 = extractFloat128Frac0( a );
6709     aExp = extractFloat128Exp( a );
6710     aSign = extractFloat128Sign( a );
6711     if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
6712     shiftCount = aExp - 0x402F;
6713     if ( 0 < shiftCount ) {
6714         if ( 0x403E <= aExp ) {
6715             aSig0 &= UINT64_C(0x0000FFFFFFFFFFFF);
6716             if (    ( a.high == UINT64_C(0xC03E000000000000) )
6717                  && ( aSig1 < UINT64_C(0x0002000000000000) ) ) {
6718                 if (aSig1) {
6719                     float_raise(float_flag_inexact, status);
6720                 }
6721             }
6722             else {
6723                 float_raise(float_flag_invalid, status);
6724                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
6725                     return INT64_MAX;
6726                 }
6727             }
6728             return INT64_MIN;
6729         }
6730         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
6731         if ( (uint64_t) ( aSig1<<shiftCount ) ) {
6732             float_raise(float_flag_inexact, status);
6733         }
6734     }
6735     else {
6736         if ( aExp < 0x3FFF ) {
6737             if ( aExp | aSig0 | aSig1 ) {
6738                 float_raise(float_flag_inexact, status);
6739             }
6740             return 0;
6741         }
6742         z = aSig0>>( - shiftCount );
6743         if (    aSig1
6744              || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
6745             float_raise(float_flag_inexact, status);
6746         }
6747     }
6748     if ( aSign ) z = - z;
6749     return z;
6750 
6751 }
6752 
6753 /*----------------------------------------------------------------------------
6754 | Returns the result of converting the quadruple-precision floating-point value
6755 | `a' to the 64-bit unsigned integer format.  The conversion is
6756 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6757 | Arithmetic---which means in particular that the conversion is rounded
6758 | according to the current rounding mode.  If `a' is a NaN, the largest
6759 | positive integer is returned.  If the conversion overflows, the
6760 | largest unsigned integer is returned.  If 'a' is negative, the value is
6761 | rounded and zero is returned; negative values that do not round to zero
6762 | will raise the inexact exception.
6763 *----------------------------------------------------------------------------*/
6764 
6765 uint64_t float128_to_uint64(float128 a, float_status *status)
6766 {
6767     bool aSign;
6768     int aExp;
6769     int shiftCount;
6770     uint64_t aSig0, aSig1;
6771 
6772     aSig0 = extractFloat128Frac0(a);
6773     aSig1 = extractFloat128Frac1(a);
6774     aExp = extractFloat128Exp(a);
6775     aSign = extractFloat128Sign(a);
6776     if (aSign && (aExp > 0x3FFE)) {
6777         float_raise(float_flag_invalid, status);
6778         if (float128_is_any_nan(a)) {
6779             return UINT64_MAX;
6780         } else {
6781             return 0;
6782         }
6783     }
6784     if (aExp) {
6785         aSig0 |= UINT64_C(0x0001000000000000);
6786     }
6787     shiftCount = 0x402F - aExp;
6788     if (shiftCount <= 0) {
6789         if (0x403E < aExp) {
6790             float_raise(float_flag_invalid, status);
6791             return UINT64_MAX;
6792         }
6793         shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
6794     } else {
6795         shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
6796     }
6797     return roundAndPackUint64(aSign, aSig0, aSig1, status);
6798 }
6799 
6800 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
6801 {
6802     uint64_t v;
6803     signed char current_rounding_mode = status->float_rounding_mode;
6804 
6805     set_float_rounding_mode(float_round_to_zero, status);
6806     v = float128_to_uint64(a, status);
6807     set_float_rounding_mode(current_rounding_mode, status);
6808 
6809     return v;
6810 }
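
/*----------------------------------------------------------------------------
| Editor's illustrative sketch (not part of the SoftFloat/QEMU sources): the
| save/override/restore pattern used by the wrapper above, applied to an
| arbitrary conversion.  Forcing float_round_to_zero only for the duration
| of one call leaves the caller-visible rounding mode untouched.  The helper
| name is hypothetical and the block is never compiled.
*----------------------------------------------------------------------------*/
#if 0
static int64_t example_scoped_round_to_zero(float128 a, float_status *st)
{
    signed char saved = st->float_rounding_mode;
    int64_t v;

    set_float_rounding_mode(float_round_to_zero, st);
    v = float128_to_int64(a, st);
    set_float_rounding_mode(saved, st);
    return v;
}
#endif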
6811 
6812 /*----------------------------------------------------------------------------
6813 | Returns the result of converting the quadruple-precision floating-point
6814 | value `a' to the 32-bit unsigned integer format.  The conversion
6815 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6816 | Arithmetic, except that the conversion is always rounded toward zero.
6817 | If `a' is a NaN, the largest positive integer is returned.  Otherwise,
6818 | if the conversion overflows, the largest unsigned integer is returned.
6819 | If 'a' is negative, the value is rounded and zero is returned; negative
6820 | values that do not round to zero will raise the inexact exception.
6821 *----------------------------------------------------------------------------*/
6822 
6823 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
6824 {
6825     uint64_t v;
6826     uint32_t res;
6827     int old_exc_flags = get_float_exception_flags(status);
6828 
6829     v = float128_to_uint64_round_to_zero(a, status);
6830     if (v > 0xffffffff) {
6831         res = 0xffffffff;
6832     } else {
6833         return v;
6834     }
6835     set_float_exception_flags(old_exc_flags, status);
6836     float_raise(float_flag_invalid, status);
6837     return res;
6838 }
6839 
6840 /*----------------------------------------------------------------------------
6841 | Returns the result of converting the quadruple-precision floating-point value
6842 | `a' to the 32-bit unsigned integer format.  The conversion is
6843 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6844 | Arithmetic---which means in particular that the conversion is rounded
6845 | according to the current rounding mode.  If `a' is a NaN, the largest
6846 | positive integer is returned.  If the conversion overflows, the
6847 | largest unsigned integer is returned.  If 'a' is negative, the value is
6848 | rounded and zero is returned; negative values that do not round to zero
6849 | will raise the inexact exception.
6850 *----------------------------------------------------------------------------*/
6851 
6852 uint32_t float128_to_uint32(float128 a, float_status *status)
6853 {
6854     uint64_t v;
6855     uint32_t res;
6856     int old_exc_flags = get_float_exception_flags(status);
6857 
6858     v = float128_to_uint64(a, status);
6859     if (v > 0xffffffff) {
6860         res = 0xffffffff;
6861     } else {
6862         return v;
6863     }
6864     set_float_exception_flags(old_exc_flags, status);
6865     float_raise(float_flag_invalid, status);
6866     return res;
6867 }
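
/*----------------------------------------------------------------------------
| Editor's illustrative sketch (not part of the SoftFloat/QEMU sources): the
| two 32-bit helpers above convert via the 64-bit path and, on overflow,
| restore the previously saved exception flags before raising only
| float_flag_invalid, so spurious flags from the wide conversion are not
| reported.  The helper name is hypothetical and the block is never
| compiled.
*----------------------------------------------------------------------------*/
#if 0
static void example_float128_to_uint32_overflow(float_status *st)
{
    float128 big = int64_to_float128(INT64_C(1) << 40, st);  /* > UINT32_MAX */

    set_float_exception_flags(0, st);
    uint32_t v = float128_to_uint32(big, st);                /* 0xffffffff   */
    bool only_invalid =
        get_float_exception_flags(st) == float_flag_invalid;
    (void)v; (void)only_invalid;
}
#endif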
6868 
6869 /*----------------------------------------------------------------------------
6870 | Returns the result of converting the quadruple-precision floating-point
6871 | value `a' to the single-precision floating-point format.  The conversion
6872 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6873 | Arithmetic.
6874 *----------------------------------------------------------------------------*/
6875 
6876 float32 float128_to_float32(float128 a, float_status *status)
6877 {
6878     bool aSign;
6879     int32_t aExp;
6880     uint64_t aSig0, aSig1;
6881     uint32_t zSig;
6882 
6883     aSig1 = extractFloat128Frac1( a );
6884     aSig0 = extractFloat128Frac0( a );
6885     aExp = extractFloat128Exp( a );
6886     aSign = extractFloat128Sign( a );
6887     if ( aExp == 0x7FFF ) {
6888         if ( aSig0 | aSig1 ) {
6889             return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
6890         }
6891         return packFloat32( aSign, 0xFF, 0 );
6892     }
6893     aSig0 |= ( aSig1 != 0 );
6894     shift64RightJamming( aSig0, 18, &aSig0 );
6895     zSig = aSig0;
6896     if ( aExp || zSig ) {
6897         zSig |= 0x40000000;
6898         aExp -= 0x3F81;
6899     }
6900     return roundAndPackFloat32(aSign, aExp, zSig, status);
6901 
6902 }
6903 
6904 /*----------------------------------------------------------------------------
6905 | Returns the result of converting the quadruple-precision floating-point
6906 | value `a' to the double-precision floating-point format.  The conversion
6907 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6908 | Arithmetic.
6909 *----------------------------------------------------------------------------*/
6910 
6911 float64 float128_to_float64(float128 a, float_status *status)
6912 {
6913     bool aSign;
6914     int32_t aExp;
6915     uint64_t aSig0, aSig1;
6916 
6917     aSig1 = extractFloat128Frac1( a );
6918     aSig0 = extractFloat128Frac0( a );
6919     aExp = extractFloat128Exp( a );
6920     aSign = extractFloat128Sign( a );
6921     if ( aExp == 0x7FFF ) {
6922         if ( aSig0 | aSig1 ) {
6923             return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
6924         }
6925         return packFloat64( aSign, 0x7FF, 0 );
6926     }
6927     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6928     aSig0 |= ( aSig1 != 0 );
6929     if ( aExp || aSig0 ) {
6930         aSig0 |= UINT64_C(0x4000000000000000);
6931         aExp -= 0x3C01;
6932     }
6933     return roundAndPackFloat64(aSign, aExp, aSig0, status);
6934 
6935 }
6936 
6937 /*----------------------------------------------------------------------------
6938 | Returns the result of converting the quadruple-precision floating-point
6939 | value `a' to the extended double-precision floating-point format.  The
6940 | conversion is performed according to the IEC/IEEE Standard for Binary
6941 | Floating-Point Arithmetic.
6942 *----------------------------------------------------------------------------*/
6943 
6944 floatx80 float128_to_floatx80(float128 a, float_status *status)
6945 {
6946     bool aSign;
6947     int32_t aExp;
6948     uint64_t aSig0, aSig1;
6949 
6950     aSig1 = extractFloat128Frac1( a );
6951     aSig0 = extractFloat128Frac0( a );
6952     aExp = extractFloat128Exp( a );
6953     aSign = extractFloat128Sign( a );
6954     if ( aExp == 0x7FFF ) {
6955         if ( aSig0 | aSig1 ) {
6956             floatx80 res = commonNaNToFloatx80(float128ToCommonNaN(a, status),
6957                                                status);
6958             return floatx80_silence_nan(res, status);
6959         }
6960         return packFloatx80(aSign, floatx80_infinity_high,
6961                                    floatx80_infinity_low);
6962     }
6963     if ( aExp == 0 ) {
6964         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
6965         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6966     }
6967     else {
6968         aSig0 |= UINT64_C(0x0001000000000000);
6969     }
6970     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
6971     return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
6972 
6973 }
6974 
6975 /*----------------------------------------------------------------------------
6976 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6977 | returns the result as a quadruple-precision floating-point value.  The
6978 | operation is performed according to the IEC/IEEE Standard for Binary
6979 | Floating-Point Arithmetic.
6980 *----------------------------------------------------------------------------*/
6981 
6982 float128 float128_round_to_int(float128 a, float_status *status)
6983 {
6984     bool aSign;
6985     int32_t aExp;
6986     uint64_t lastBitMask, roundBitsMask;
6987     float128 z;
6988 
6989     aExp = extractFloat128Exp( a );
6990     if ( 0x402F <= aExp ) {
6991         if ( 0x406F <= aExp ) {
6992             if (    ( aExp == 0x7FFF )
6993                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
6994                ) {
6995                 return propagateFloat128NaN(a, a, status);
6996             }
6997             return a;
6998         }
6999         lastBitMask = 1;
7000         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
7001         roundBitsMask = lastBitMask - 1;
7002         z = a;
7003         switch (status->float_rounding_mode) {
7004         case float_round_nearest_even:
7005             if ( lastBitMask ) {
7006                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
7007                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
7008             }
7009             else {
7010                 if ( (int64_t) z.low < 0 ) {
7011                     ++z.high;
7012                     if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
7013                 }
7014             }
7015             break;
7016         case float_round_ties_away:
7017             if (lastBitMask) {
7018                 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
7019             } else {
7020                 if ((int64_t) z.low < 0) {
7021                     ++z.high;
7022                 }
7023             }
7024             break;
7025         case float_round_to_zero:
7026             break;
7027         case float_round_up:
7028             if (!extractFloat128Sign(z)) {
7029                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
7030             }
7031             break;
7032         case float_round_down:
7033             if (extractFloat128Sign(z)) {
7034                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
7035             }
7036             break;
7037         case float_round_to_odd:
7038             /*
7039              * Note that if lastBitMask == 0, the last bit is the lsb
7040              * of high, and roundBitsMask == -1.
7041              */
7042             if ((lastBitMask ? z.low & lastBitMask : z.high & 1) == 0) {
7043                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
7044             }
7045             break;
7046         default:
7047             abort();
7048         }
7049         z.low &= ~ roundBitsMask;
7050     }
7051     else {
7052         if ( aExp < 0x3FFF ) {
7053             if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
7054             float_raise(float_flag_inexact, status);
7055             aSign = extractFloat128Sign( a );
7056             switch (status->float_rounding_mode) {
7057             case float_round_nearest_even:
7058                 if (    ( aExp == 0x3FFE )
7059                      && (   extractFloat128Frac0( a )
7060                           | extractFloat128Frac1( a ) )
7061                    ) {
7062                     return packFloat128( aSign, 0x3FFF, 0, 0 );
7063                 }
7064                 break;
7065             case float_round_ties_away:
7066                 if (aExp == 0x3FFE) {
7067                     return packFloat128(aSign, 0x3FFF, 0, 0);
7068                 }
7069                 break;
7070             case float_round_down:
7071                 return
7072                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
7073                     : packFloat128( 0, 0, 0, 0 );
7074             case float_round_up:
7075                 return
7076                       aSign ? packFloat128( 1, 0, 0, 0 )
7077                     : packFloat128( 0, 0x3FFF, 0, 0 );
7078 
7079             case float_round_to_odd:
7080                 return packFloat128(aSign, 0x3FFF, 0, 0);
7081 
7082             case float_round_to_zero:
7083                 break;
7084             }
7085             return packFloat128( aSign, 0, 0, 0 );
7086         }
7087         lastBitMask = 1;
7088         lastBitMask <<= 0x402F - aExp;
7089         roundBitsMask = lastBitMask - 1;
7090         z.low = 0;
7091         z.high = a.high;
7092         switch (status->float_rounding_mode) {
7093         case float_round_nearest_even:
7094             z.high += lastBitMask>>1;
7095             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
7096                 z.high &= ~ lastBitMask;
7097             }
7098             break;
7099         case float_round_ties_away:
7100             z.high += lastBitMask>>1;
7101             break;
7102         case float_round_to_zero:
7103             break;
7104         case float_round_up:
7105             if (!extractFloat128Sign(z)) {
7106                 z.high |= ( a.low != 0 );
7107                 z.high += roundBitsMask;
7108             }
7109             break;
7110         case float_round_down:
7111             if (extractFloat128Sign(z)) {
7112                 z.high |= (a.low != 0);
7113                 z.high += roundBitsMask;
7114             }
7115             break;
7116         case float_round_to_odd:
7117             if ((z.high & lastBitMask) == 0) {
7118                 z.high |= (a.low != 0);
7119                 z.high += roundBitsMask;
7120             }
7121             break;
7122         default:
7123             abort();
7124         }
7125         z.high &= ~ roundBitsMask;
7126     }
7127     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
7128         float_raise(float_flag_inexact, status);
7129     }
7130     return z;
7131 
7132 }
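
/*----------------------------------------------------------------------------
| Editor's illustrative sketch (not part of the SoftFloat/QEMU sources): how
| the current rounding mode steers float128_round_to_int() above.  2.5
| rounds to 2.0 under round-to-nearest-even but to 3.0 under
| float_round_ties_away; float_flag_inexact is raised whenever the result
| differs from the input.  The helper name is hypothetical and the block is
| never compiled.
*----------------------------------------------------------------------------*/
#if 0
static void example_float128_round_to_int(float_status *st)
{
    float128 two  = int64_to_float128(2, st);
    float128 five = int64_to_float128(5, st);
    float128 two_point_five = float128_div(five, two, st);      /* exactly 2.5 */

    set_float_rounding_mode(float_round_nearest_even, st);
    float128 even = float128_round_to_int(two_point_five, st);  /* 2.0 */

    set_float_rounding_mode(float_round_ties_away, st);
    float128 away = float128_round_to_int(two_point_five, st);  /* 3.0 */
    (void)even; (void)away;
}
#endif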
7133 
7134 /*----------------------------------------------------------------------------
7135 | Returns the result of adding the absolute values of the quadruple-precision
7136 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
7137 | before being returned.  `zSign' is ignored if the result is a NaN.
7138 | The addition is performed according to the IEC/IEEE Standard for Binary
7139 | Floating-Point Arithmetic.
7140 *----------------------------------------------------------------------------*/
7141 
7142 static float128 addFloat128Sigs(float128 a, float128 b, bool zSign,
7143                                 float_status *status)
7144 {
7145     int32_t aExp, bExp, zExp;
7146     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
7147     int32_t expDiff;
7148 
7149     aSig1 = extractFloat128Frac1( a );
7150     aSig0 = extractFloat128Frac0( a );
7151     aExp = extractFloat128Exp( a );
7152     bSig1 = extractFloat128Frac1( b );
7153     bSig0 = extractFloat128Frac0( b );
7154     bExp = extractFloat128Exp( b );
7155     expDiff = aExp - bExp;
7156     if ( 0 < expDiff ) {
7157         if ( aExp == 0x7FFF ) {
7158             if (aSig0 | aSig1) {
7159                 return propagateFloat128NaN(a, b, status);
7160             }
7161             return a;
7162         }
7163         if ( bExp == 0 ) {
7164             --expDiff;
7165         }
7166         else {
7167             bSig0 |= UINT64_C(0x0001000000000000);
7168         }
7169         shift128ExtraRightJamming(
7170             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
7171         zExp = aExp;
7172     }
7173     else if ( expDiff < 0 ) {
7174         if ( bExp == 0x7FFF ) {
7175             if (bSig0 | bSig1) {
7176                 return propagateFloat128NaN(a, b, status);
7177             }
7178             return packFloat128( zSign, 0x7FFF, 0, 0 );
7179         }
7180         if ( aExp == 0 ) {
7181             ++expDiff;
7182         }
7183         else {
7184             aSig0 |= UINT64_C(0x0001000000000000);
7185         }
7186         shift128ExtraRightJamming(
7187             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
7188         zExp = bExp;
7189     }
7190     else {
7191         if ( aExp == 0x7FFF ) {
7192             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
7193                 return propagateFloat128NaN(a, b, status);
7194             }
7195             return a;
7196         }
7197         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
7198         if ( aExp == 0 ) {
7199             if (status->flush_to_zero) {
7200                 if (zSig0 | zSig1) {
7201                     float_raise(float_flag_output_denormal, status);
7202                 }
7203                 return packFloat128(zSign, 0, 0, 0);
7204             }
7205             return packFloat128( zSign, 0, zSig0, zSig1 );
7206         }
7207         zSig2 = 0;
7208         zSig0 |= UINT64_C(0x0002000000000000);
7209         zExp = aExp;
7210         goto shiftRight1;
7211     }
7212     aSig0 |= UINT64_C(0x0001000000000000);
7213     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
7214     --zExp;
7215     if ( zSig0 < UINT64_C(0x0002000000000000) ) goto roundAndPack;
7216     ++zExp;
7217  shiftRight1:
7218     shift128ExtraRightJamming(
7219         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
7220  roundAndPack:
7221     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
7222 
7223 }
7224 
7225 /*----------------------------------------------------------------------------
7226 | Returns the result of subtracting the absolute values of the quadruple-
7227 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
7228 | difference is negated before being returned.  `zSign' is ignored if the
7229 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
7230 | Standard for Binary Floating-Point Arithmetic.
7231 *----------------------------------------------------------------------------*/
7232 
7233 static float128 subFloat128Sigs(float128 a, float128 b, bool zSign,
7234                                 float_status *status)
7235 {
7236     int32_t aExp, bExp, zExp;
7237     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
7238     int32_t expDiff;
7239 
7240     aSig1 = extractFloat128Frac1( a );
7241     aSig0 = extractFloat128Frac0( a );
7242     aExp = extractFloat128Exp( a );
7243     bSig1 = extractFloat128Frac1( b );
7244     bSig0 = extractFloat128Frac0( b );
7245     bExp = extractFloat128Exp( b );
7246     expDiff = aExp - bExp;
7247     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
7248     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
7249     if ( 0 < expDiff ) goto aExpBigger;
7250     if ( expDiff < 0 ) goto bExpBigger;
7251     if ( aExp == 0x7FFF ) {
7252         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
7253             return propagateFloat128NaN(a, b, status);
7254         }
7255         float_raise(float_flag_invalid, status);
7256         return float128_default_nan(status);
7257     }
7258     if ( aExp == 0 ) {
7259         aExp = 1;
7260         bExp = 1;
7261     }
7262     if ( bSig0 < aSig0 ) goto aBigger;
7263     if ( aSig0 < bSig0 ) goto bBigger;
7264     if ( bSig1 < aSig1 ) goto aBigger;
7265     if ( aSig1 < bSig1 ) goto bBigger;
7266     return packFloat128(status->float_rounding_mode == float_round_down,
7267                         0, 0, 0);
7268  bExpBigger:
7269     if ( bExp == 0x7FFF ) {
7270         if (bSig0 | bSig1) {
7271             return propagateFloat128NaN(a, b, status);
7272         }
7273         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
7274     }
7275     if ( aExp == 0 ) {
7276         ++expDiff;
7277     }
7278     else {
7279         aSig0 |= UINT64_C(0x4000000000000000);
7280     }
7281     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
7282     bSig0 |= UINT64_C(0x4000000000000000);
7283  bBigger:
7284     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
7285     zExp = bExp;
7286     zSign ^= 1;
7287     goto normalizeRoundAndPack;
7288  aExpBigger:
7289     if ( aExp == 0x7FFF ) {
7290         if (aSig0 | aSig1) {
7291             return propagateFloat128NaN(a, b, status);
7292         }
7293         return a;
7294     }
7295     if ( bExp == 0 ) {
7296         --expDiff;
7297     }
7298     else {
7299         bSig0 |= UINT64_C(0x4000000000000000);
7300     }
7301     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
7302     aSig0 |= UINT64_C(0x4000000000000000);
7303  aBigger:
7304     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
7305     zExp = aExp;
7306  normalizeRoundAndPack:
7307     --zExp;
7308     return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
7309                                          status);
7310 
7311 }
7312 
7313 /*----------------------------------------------------------------------------
7314 | Returns the result of adding the quadruple-precision floating-point values
7315 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
7316 | for Binary Floating-Point Arithmetic.
7317 *----------------------------------------------------------------------------*/
7318 
7319 float128 float128_add(float128 a, float128 b, float_status *status)
7320 {
7321     bool aSign, bSign;
7322 
7323     aSign = extractFloat128Sign( a );
7324     bSign = extractFloat128Sign( b );
7325     if ( aSign == bSign ) {
7326         return addFloat128Sigs(a, b, aSign, status);
7327     }
7328     else {
7329         return subFloat128Sigs(a, b, aSign, status);
7330     }
7331 
7332 }
7333 
7334 /*----------------------------------------------------------------------------
7335 | Returns the result of subtracting the quadruple-precision floating-point
7336 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
7337 | Standard for Binary Floating-Point Arithmetic.
7338 *----------------------------------------------------------------------------*/
7339 
7340 float128 float128_sub(float128 a, float128 b, float_status *status)
7341 {
7342     bool aSign, bSign;
7343 
7344     aSign = extractFloat128Sign( a );
7345     bSign = extractFloat128Sign( b );
7346     if ( aSign == bSign ) {
7347         return subFloat128Sigs(a, b, aSign, status);
7348     }
7349     else {
7350         return addFloat128Sigs(a, b, aSign, status);
7351     }
7352 
7353 }
7354 
7355 /*----------------------------------------------------------------------------
7356 | Returns the result of multiplying the quadruple-precision floating-point
7357 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
7358 | Standard for Binary Floating-Point Arithmetic.
7359 *----------------------------------------------------------------------------*/
7360 
7361 float128 float128_mul(float128 a, float128 b, float_status *status)
7362 {
7363     bool aSign, bSign, zSign;
7364     int32_t aExp, bExp, zExp;
7365     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
7366 
7367     aSig1 = extractFloat128Frac1( a );
7368     aSig0 = extractFloat128Frac0( a );
7369     aExp = extractFloat128Exp( a );
7370     aSign = extractFloat128Sign( a );
7371     bSig1 = extractFloat128Frac1( b );
7372     bSig0 = extractFloat128Frac0( b );
7373     bExp = extractFloat128Exp( b );
7374     bSign = extractFloat128Sign( b );
7375     zSign = aSign ^ bSign;
7376     if ( aExp == 0x7FFF ) {
7377         if (    ( aSig0 | aSig1 )
7378              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
7379             return propagateFloat128NaN(a, b, status);
7380         }
7381         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
7382         return packFloat128( zSign, 0x7FFF, 0, 0 );
7383     }
7384     if ( bExp == 0x7FFF ) {
7385         if (bSig0 | bSig1) {
7386             return propagateFloat128NaN(a, b, status);
7387         }
7388         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
7389  invalid:
7390             float_raise(float_flag_invalid, status);
7391             return float128_default_nan(status);
7392         }
7393         return packFloat128( zSign, 0x7FFF, 0, 0 );
7394     }
7395     if ( aExp == 0 ) {
7396         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
7397         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7398     }
7399     if ( bExp == 0 ) {
7400         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
7401         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
7402     }
7403     zExp = aExp + bExp - 0x4000;
7404     aSig0 |= UINT64_C(0x0001000000000000);
7405     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
7406     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
7407     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
7408     zSig2 |= ( zSig3 != 0 );
7409     if (UINT64_C( 0x0002000000000000) <= zSig0 ) {
7410         shift128ExtraRightJamming(
7411             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
7412         ++zExp;
7413     }
7414     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
7415 
7416 }
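
/*----------------------------------------------------------------------------
| Editor's illustrative sketch (not part of the SoftFloat/QEMU sources): the
| invalid case of float128_mul() above.  Multiplying infinity by zero takes
| the "goto invalid" path, raising float_flag_invalid and returning the
| default NaN; any other finite-by-infinity product returns a correctly
| signed infinity.  The helper name is hypothetical and the block is never
| compiled.
*----------------------------------------------------------------------------*/
#if 0
static float128 example_float128_mul_inf_times_zero(float_status *st)
{
    float128 zero = int64_to_float128(0, st);
    float128 one  = int64_to_float128(1, st);
    float128 inf  = float128_div(one, zero, st);   /* +inf, raises divbyzero */

    set_float_exception_flags(0, st);
    return float128_mul(inf, zero, st);            /* default NaN + invalid  */
}
#endif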
7417 
7418 /*----------------------------------------------------------------------------
7419 | Returns the result of dividing the quadruple-precision floating-point value
7420 | `a' by the corresponding value `b'.  The operation is performed according to
7421 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7422 *----------------------------------------------------------------------------*/
7423 
7424 float128 float128_div(float128 a, float128 b, float_status *status)
7425 {
7426     bool aSign, bSign, zSign;
7427     int32_t aExp, bExp, zExp;
7428     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
7429     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
7430 
7431     aSig1 = extractFloat128Frac1( a );
7432     aSig0 = extractFloat128Frac0( a );
7433     aExp = extractFloat128Exp( a );
7434     aSign = extractFloat128Sign( a );
7435     bSig1 = extractFloat128Frac1( b );
7436     bSig0 = extractFloat128Frac0( b );
7437     bExp = extractFloat128Exp( b );
7438     bSign = extractFloat128Sign( b );
7439     zSign = aSign ^ bSign;
7440     if ( aExp == 0x7FFF ) {
7441         if (aSig0 | aSig1) {
7442             return propagateFloat128NaN(a, b, status);
7443         }
7444         if ( bExp == 0x7FFF ) {
7445             if (bSig0 | bSig1) {
7446                 return propagateFloat128NaN(a, b, status);
7447             }
7448             goto invalid;
7449         }
7450         return packFloat128( zSign, 0x7FFF, 0, 0 );
7451     }
7452     if ( bExp == 0x7FFF ) {
7453         if (bSig0 | bSig1) {
7454             return propagateFloat128NaN(a, b, status);
7455         }
7456         return packFloat128( zSign, 0, 0, 0 );
7457     }
7458     if ( bExp == 0 ) {
7459         if ( ( bSig0 | bSig1 ) == 0 ) {
7460             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
7461  invalid:
7462                 float_raise(float_flag_invalid, status);
7463                 return float128_default_nan(status);
7464             }
7465             float_raise(float_flag_divbyzero, status);
7466             return packFloat128( zSign, 0x7FFF, 0, 0 );
7467         }
7468         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
7469     }
7470     if ( aExp == 0 ) {
7471         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
7472         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7473     }
7474     zExp = aExp - bExp + 0x3FFD;
7475     shortShift128Left(
7476         aSig0 | UINT64_C(0x0001000000000000), aSig1, 15, &aSig0, &aSig1 );
7477     shortShift128Left(
7478         bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 );
7479     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
7480         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
7481         ++zExp;
7482     }
7483     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
7484     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
7485     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
7486     while ( (int64_t) rem0 < 0 ) {
7487         --zSig0;
7488         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
7489     }
7490     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
7491     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
7492         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
7493         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
7494         while ( (int64_t) rem1 < 0 ) {
7495             --zSig1;
7496             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
7497         }
7498         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
7499     }
7500     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
7501     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
7502 
7503 }
7504 
7505 /*----------------------------------------------------------------------------
7506 | Returns the remainder of the quadruple-precision floating-point value `a'
7507 | with respect to the corresponding value `b'.  The operation is performed
7508 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7509 *----------------------------------------------------------------------------*/
7510 
7511 float128 float128_rem(float128 a, float128 b, float_status *status)
7512 {
7513     bool aSign, zSign;
7514     int32_t aExp, bExp, expDiff;
7515     uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
7516     uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
7517     int64_t sigMean0;
7518 
7519     aSig1 = extractFloat128Frac1( a );
7520     aSig0 = extractFloat128Frac0( a );
7521     aExp = extractFloat128Exp( a );
7522     aSign = extractFloat128Sign( a );
7523     bSig1 = extractFloat128Frac1( b );
7524     bSig0 = extractFloat128Frac0( b );
7525     bExp = extractFloat128Exp( b );
7526     if ( aExp == 0x7FFF ) {
7527         if (    ( aSig0 | aSig1 )
7528              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
7529             return propagateFloat128NaN(a, b, status);
7530         }
7531         goto invalid;
7532     }
7533     if ( bExp == 0x7FFF ) {
7534         if (bSig0 | bSig1) {
7535             return propagateFloat128NaN(a, b, status);
7536         }
7537         return a;
7538     }
7539     if ( bExp == 0 ) {
7540         if ( ( bSig0 | bSig1 ) == 0 ) {
7541  invalid:
7542             float_raise(float_flag_invalid, status);
7543             return float128_default_nan(status);
7544         }
7545         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
7546     }
7547     if ( aExp == 0 ) {
7548         if ( ( aSig0 | aSig1 ) == 0 ) return a;
7549         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7550     }
7551     expDiff = aExp - bExp;
7552     if ( expDiff < -1 ) return a;
7553     shortShift128Left(
7554         aSig0 | UINT64_C(0x0001000000000000),
7555         aSig1,
7556         15 - ( expDiff < 0 ),
7557         &aSig0,
7558         &aSig1
7559     );
7560     shortShift128Left(
7561         bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 );
7562     q = le128( bSig0, bSig1, aSig0, aSig1 );
7563     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
7564     expDiff -= 64;
7565     while ( 0 < expDiff ) {
7566         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
7567         q = ( 4 < q ) ? q - 4 : 0;
7568         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
7569         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
7570         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
7571         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
7572         expDiff -= 61;
7573     }
7574     if ( -64 < expDiff ) {
7575         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
7576         q = ( 4 < q ) ? q - 4 : 0;
7577         q >>= - expDiff;
7578         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
7579         expDiff += 52;
7580         if ( expDiff < 0 ) {
7581             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
7582         }
7583         else {
7584             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
7585         }
7586         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
7587         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
7588     }
7589     else {
7590         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
7591         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
7592     }
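         /* Determine the last quotient bit: subtract `b' until the partial
          * remainder goes negative, then use the sum of the last two partial
          * remainders (sigMean) to pick whichever is nearer to zero, with
          * ties going to the even quotient (the q & 1 test). */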
7593     do {
7594         alternateASig0 = aSig0;
7595         alternateASig1 = aSig1;
7596         ++q;
7597         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
7598     } while ( 0 <= (int64_t) aSig0 );
7599     add128(
7600         aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
7601     if (    ( sigMean0 < 0 )
7602          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
7603         aSig0 = alternateASig0;
7604         aSig1 = alternateASig1;
7605     }
7606     zSign = ( (int64_t) aSig0 < 0 );
7607     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
7608     return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
7609                                          status);
7610 }
7611 
7612 /*----------------------------------------------------------------------------
7613 | Returns the square root of the quadruple-precision floating-point value `a'.
7614 | The operation is performed according to the IEC/IEEE Standard for Binary
7615 | Floating-Point Arithmetic.
7616 *----------------------------------------------------------------------------*/
7617 
7618 float128 float128_sqrt(float128 a, float_status *status)
7619 {
7620     bool aSign;
7621     int32_t aExp, zExp;
7622     uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
7623     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
7624 
7625     aSig1 = extractFloat128Frac1( a );
7626     aSig0 = extractFloat128Frac0( a );
7627     aExp = extractFloat128Exp( a );
7628     aSign = extractFloat128Sign( a );
7629     if ( aExp == 0x7FFF ) {
7630         if (aSig0 | aSig1) {
7631             return propagateFloat128NaN(a, a, status);
7632         }
7633         if ( ! aSign ) return a;
7634         goto invalid;
7635     }
7636     if ( aSign ) {
7637         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
7638  invalid:
7639         float_raise(float_flag_invalid, status);
7640         return float128_default_nan(status);
7641     }
7642     if ( aExp == 0 ) {
7643         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
7644         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7645     }
7646     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
7647     aSig0 |= UINT64_C(0x0001000000000000);
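         /* Form a first estimate of the root's significand, refine it to
          * 64 bits, and correct it downward until the remainder
          * aSig - zSig0 * zSig0 is non-negative. */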
7648     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
7649     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
7650     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
7651     doubleZSig0 = zSig0<<1;
7652     mul64To128( zSig0, zSig0, &term0, &term1 );
7653     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
7654     while ( (int64_t) rem0 < 0 ) {
7655         --zSig0;
7656         doubleZSig0 -= 2;
7657         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
7658     }
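         /* Compute the low 64 bits of the root.  The estimate may be a few
          * units too large; when its low bits are small enough for that to
          * affect rounding, verify it exactly and set the sticky bit from
          * the exact remainder. */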
7659     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
7660     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
7661         if ( zSig1 == 0 ) zSig1 = 1;
7662         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
7663         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
7664         mul64To128( zSig1, zSig1, &term2, &term3 );
7665         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
7666         while ( (int64_t) rem1 < 0 ) {
7667             --zSig1;
7668             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
7669             term3 |= 1;
7670             term2 |= doubleZSig0;
7671             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
7672         }
7673         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
7674     }
7675     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
7676     return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
7677 
7678 }
7679 
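     /*----------------------------------------------------------------------------
     | Compares the extended double-precision floating-point values `a' and `b'
     | and returns the relation as less, equal, greater, or unordered.  Invalid
     | encodings and signaling NaNs always raise the invalid exception; quiet
     | NaNs raise it only when `is_quiet' is false.
     *----------------------------------------------------------------------------*/
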
7680 static inline FloatRelation
7681 floatx80_compare_internal(floatx80 a, floatx80 b, bool is_quiet,
7682                           float_status *status)
7683 {
7684     bool aSign, bSign;
7685 
7686     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
7687         float_raise(float_flag_invalid, status);
7688         return float_relation_unordered;
7689     }
7690     if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
7691           ( extractFloatx80Frac( a )<<1 ) ) ||
7692         ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
7693           ( extractFloatx80Frac( b )<<1 ) )) {
7694         if (!is_quiet ||
7695             floatx80_is_signaling_nan(a, status) ||
7696             floatx80_is_signaling_nan(b, status)) {
7697             float_raise(float_flag_invalid, status);
7698         }
7699         return float_relation_unordered;
7700     }
7701     aSign = extractFloatx80Sign( a );
7702     bSign = extractFloatx80Sign( b );
7703     if ( aSign != bSign ) {
7704 
7705         if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
7706              ( ( a.low | b.low ) == 0 ) ) {
7707             /* zero case */
7708             return float_relation_equal;
7709         } else {
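                 /* Signs differ and the operands are not both zero: the
                  * positive operand (aSign == 0) compares greater (+1) and
                  * the negative one compares less (-1). */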
7710             return 1 - (2 * aSign);
7711         }
7712     } else {
7713         /* Normalize pseudo-denormals before comparison.  */
7714         if ((a.high & 0x7fff) == 0 && a.low & UINT64_C(0x8000000000000000)) {
7715             ++a.high;
7716         }
7717         if ((b.high & 0x7fff) == 0 && b.low & UINT64_C(0x8000000000000000)) {
7718             ++b.high;
7719         }
7720         if (a.low == b.low && a.high == b.high) {
7721             return float_relation_equal;
7722         } else {
7723             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7724         }
7725     }
7726 }
7727 
7728 FloatRelation floatx80_compare(floatx80 a, floatx80 b, float_status *status)
7729 {
7730     return floatx80_compare_internal(a, b, false, status);
7731 }
7732 
7733 FloatRelation floatx80_compare_quiet(floatx80 a, floatx80 b,
7734                                      float_status *status)
7735 {
7736     return floatx80_compare_internal(a, b, true, status);
7737 }
7738 
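     /*----------------------------------------------------------------------------
     | Compares the quadruple-precision floating-point values `a' and `b' and
     | returns the relation as less, equal, greater, or unordered.  Signaling
     | NaNs always raise the invalid exception; quiet NaNs raise it only when
     | `is_quiet' is false.
     *----------------------------------------------------------------------------*/
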
7739 static inline FloatRelation
7740 float128_compare_internal(float128 a, float128 b, bool is_quiet,
7741                           float_status *status)
7742 {
7743     bool aSign, bSign;
7744 
7745     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7746           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7747         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7748           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7749         if (!is_quiet ||
7750             float128_is_signaling_nan(a, status) ||
7751             float128_is_signaling_nan(b, status)) {
7752             float_raise(float_flag_invalid, status);
7753         }
7754         return float_relation_unordered;
7755     }
7756     aSign = extractFloat128Sign( a );
7757     bSign = extractFloat128Sign( b );
7758     if ( aSign != bSign ) {
7759         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7760             /* zero case */
7761             return float_relation_equal;
7762         } else {
7763             return 1 - (2 * aSign);
7764         }
7765     } else {
7766         if (a.low == b.low && a.high == b.high) {
7767             return float_relation_equal;
7768         } else {
7769             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7770         }
7771     }
7772 }
7773 
7774 FloatRelation float128_compare(float128 a, float128 b, float_status *status)
7775 {
7776     return float128_compare_internal(a, b, false, status);
7777 }
7778 
7779 FloatRelation float128_compare_quiet(float128 a, float128 b,
7780                                      float_status *status)
7781 {
7782     return float128_compare_internal(a, b, true, status);
7783 }
7784 
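     /*----------------------------------------------------------------------------
     | Returns the result of scaling the extended double-precision floating-point
     | value `a' by 2**n.  The operation is performed according to the IEC/IEEE
     | Standard for Binary Floating-Point Arithmetic.
     *----------------------------------------------------------------------------*/
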
7785 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
7786 {
7787     bool aSign;
7788     int32_t aExp;
7789     uint64_t aSig;
7790 
7791     if (floatx80_invalid_encoding(a)) {
7792         float_raise(float_flag_invalid, status);
7793         return floatx80_default_nan(status);
7794     }
7795     aSig = extractFloatx80Frac( a );
7796     aExp = extractFloatx80Exp( a );
7797     aSign = extractFloatx80Sign( a );
7798 
7799     if ( aExp == 0x7FFF ) {
7800         if ( aSig<<1 ) {
7801             return propagateFloatx80NaN(a, a, status);
7802         }
7803         return a;
7804     }
7805 
7806     if (aExp == 0) {
7807         if (aSig == 0) {
7808             return a;
7809         }
7810         aExp++;
7811     }
7812 
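         /* Clamp n so that aExp + n cannot overflow; +/-0x10000 is already
          * far outside the representable exponent range, so the rounding
          * code below still sees an unambiguous overflow or underflow. */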
7813     if (n > 0x10000) {
7814         n = 0x10000;
7815     } else if (n < -0x10000) {
7816         n = -0x10000;
7817     }
7818 
7819     aExp += n;
7820     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7821                                          aSign, aExp, aSig, 0, status);
7822 }
7823 
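     /*----------------------------------------------------------------------------
     | Returns the result of scaling the quadruple-precision floating-point
     | value `a' by 2**n.  The operation is performed according to the IEC/IEEE
     | Standard for Binary Floating-Point Arithmetic.
     *----------------------------------------------------------------------------*/
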
7824 float128 float128_scalbn(float128 a, int n, float_status *status)
7825 {
7826     bool aSign;
7827     int32_t aExp;
7828     uint64_t aSig0, aSig1;
7829 
7830     aSig1 = extractFloat128Frac1( a );
7831     aSig0 = extractFloat128Frac0( a );
7832     aExp = extractFloat128Exp( a );
7833     aSign = extractFloat128Sign( a );
7834     if ( aExp == 0x7FFF ) {
7835         if ( aSig0 | aSig1 ) {
7836             return propagateFloat128NaN(a, a, status);
7837         }
7838         return a;
7839     }
7840     if (aExp != 0) {
7841         aSig0 |= UINT64_C(0x0001000000000000);
7842     } else if (aSig0 == 0 && aSig1 == 0) {
7843         return a;
7844     } else {
7845         aExp++;
7846     }
7847 
7848     if (n > 0x10000) {
7849         n = 0x10000;
7850     } else if (n < -0x10000) {
7851         n = -0x10000;
7852     }
7853 
7854     aExp += n - 1;
7855     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7856     return normalizeRoundAndPackFloat128(aSign, aExp, aSig0, aSig1,
7857                                          status);
7858 }
7859 
7860 static void __attribute__((constructor)) softfloat_init(void)
7861 {
7862     union_float64 ua, ub, uc, ur;
7863 
7864     if (QEMU_NO_HARDFLOAT) {
7865         return;
7866     }
7867     /*
7868      * Test that the host's FMA is not obviously broken. For example,
7869      * glibc < 2.23 can perform an incorrect FMA on certain hosts; see
7870      *   https://sourceware.org/bugzilla/show_bug.cgi?id=13304
7871      */
7872     ua.s = 0x0020000000000001ULL;
7873     ub.s = 0x3ca0000000000000ULL;
7874     uc.s = 0x0020000000000000ULL;
7875     ur.h = fma(ua.h, ub.h, uc.h);
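         /* The correctly rounded result is uc + 1 ulp (0x0020000000000001);
          * any other value means the host fma() cannot be trusted here, so
          * force the softfloat implementation instead. */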
7876     if (ur.s != 0x0020000000000001ULL) {
7877         force_soft_fma = true;
7878     }
7879 }
7880