xref: /openbmc/qemu/fpu/softfloat.c (revision 65a117da)
1 /*
2  * QEMU float support
3  *
4  * The code in this source file is derived from release 2a of the SoftFloat
5  * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6  * some later contributions) are provided under that license, as detailed below.
7  * It has subsequently been modified by contributors to the QEMU Project,
8  * so some portions are provided under:
9  *  the SoftFloat-2a license
10  *  the BSD license
11  *  GPL-v2-or-later
12  *
13  * Any future contributions to this file after December 1st 2014 will be
14  * taken to be licensed under the Softfloat-2a license unless specifically
15  * indicated otherwise.
16  */
17 
18 /*
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
22 
23 Written by John R. Hauser.  This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704.  Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980.  The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
32 
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38 
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
43 
44 ===============================================================================
45 */
46 
47 /* BSD licensing:
48  * Copyright (c) 2006, Fabrice Bellard
49  * All rights reserved.
50  *
51  * Redistribution and use in source and binary forms, with or without
52  * modification, are permitted provided that the following conditions are met:
53  *
54  * 1. Redistributions of source code must retain the above copyright notice,
55  * this list of conditions and the following disclaimer.
56  *
57  * 2. Redistributions in binary form must reproduce the above copyright notice,
58  * this list of conditions and the following disclaimer in the documentation
59  * and/or other materials provided with the distribution.
60  *
61  * 3. Neither the name of the copyright holder nor the names of its contributors
62  * may be used to endorse or promote products derived from this software without
63  * specific prior written permission.
64  *
65  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75  * THE POSSIBILITY OF SUCH DAMAGE.
76  */
77 
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79  * version 2 or later. See the COPYING file in the top-level directory.
80  */
81 
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83  * target-dependent and needs the TARGET_* macros.
84  */
85 #include "qemu/osdep.h"
86 #include <math.h>
87 #include "qemu/bitops.h"
88 #include "fpu/softfloat.h"
89 
90 /* We only need stdlib for abort() */
91 
92 /*----------------------------------------------------------------------------
93 | Primitive arithmetic functions, including multi-word arithmetic, and
94 | division and square root approximations.  (Can be specialized to target if
95 | desired.)
96 *----------------------------------------------------------------------------*/
97 #include "fpu/softfloat-macros.h"
98 
99 /*
100  * Hardfloat
101  *
102  * Fast emulation of guest FP instructions is challenging for two reasons.
103  * First, FP instruction semantics are similar but not identical, particularly
104  * when handling NaNs. Second, emulating at reasonable speed the guest FP
105  * exception flags is not trivial: reading the host's flags register with a
106  * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp],
107  * and trapping on every FP exception is not fast nor pleasant to work with.
108  *
109  * We address these challenges by leveraging the host FPU for a subset of the
110  * operations. To do this we expand on the idea presented in this paper:
111  *
112  * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a
113  * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615.
114  *
115  * The idea is thus to leverage the host FPU to (1) compute FP operations
116  * and (2) identify whether FP exceptions occurred while avoiding
117  * expensive exception flag register accesses.
118  *
119  * An important optimization shown in the paper is that given that exception
120  * flags are rarely cleared by the guest, we can avoid recomputing some flags.
121  * This is particularly useful for the inexact flag, which is very frequently
122  * raised in floating-point workloads.
123  *
124  * We optimize the code further by deferring to soft-fp whenever FP exception
125  * detection might get hairy. Two examples: (1) when at least one operand is
126  * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result
127  * and the result is < the minimum normal.
128  */
129 #define GEN_INPUT_FLUSH__NOCHECK(name, soft_t)                          \
130     static inline void name(soft_t *a, float_status *s)                 \
131     {                                                                   \
132         if (unlikely(soft_t ## _is_denormal(*a))) {                     \
133             *a = soft_t ## _set_sign(soft_t ## _zero,                   \
134                                      soft_t ## _is_neg(*a));            \
135             s->float_exception_flags |= float_flag_input_denormal;      \
136         }                                                               \
137     }
138 
139 GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck, float32)
140 GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck, float64)
141 #undef GEN_INPUT_FLUSH__NOCHECK
142 
143 #define GEN_INPUT_FLUSH1(name, soft_t)                  \
144     static inline void name(soft_t *a, float_status *s) \
145     {                                                   \
146         if (likely(!s->flush_inputs_to_zero)) {         \
147             return;                                     \
148         }                                               \
149         soft_t ## _input_flush__nocheck(a, s);          \
150     }
151 
152 GEN_INPUT_FLUSH1(float32_input_flush1, float32)
153 GEN_INPUT_FLUSH1(float64_input_flush1, float64)
154 #undef GEN_INPUT_FLUSH1
155 
156 #define GEN_INPUT_FLUSH2(name, soft_t)                                  \
157     static inline void name(soft_t *a, soft_t *b, float_status *s)      \
158     {                                                                   \
159         if (likely(!s->flush_inputs_to_zero)) {                         \
160             return;                                                     \
161         }                                                               \
162         soft_t ## _input_flush__nocheck(a, s);                          \
163         soft_t ## _input_flush__nocheck(b, s);                          \
164     }
165 
166 GEN_INPUT_FLUSH2(float32_input_flush2, float32)
167 GEN_INPUT_FLUSH2(float64_input_flush2, float64)
168 #undef GEN_INPUT_FLUSH2
169 
170 #define GEN_INPUT_FLUSH3(name, soft_t)                                  \
171     static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \
172     {                                                                   \
173         if (likely(!s->flush_inputs_to_zero)) {                         \
174             return;                                                     \
175         }                                                               \
176         soft_t ## _input_flush__nocheck(a, s);                          \
177         soft_t ## _input_flush__nocheck(b, s);                          \
178         soft_t ## _input_flush__nocheck(c, s);                          \
179     }
180 
181 GEN_INPUT_FLUSH3(float32_input_flush3, float32)
182 GEN_INPUT_FLUSH3(float64_input_flush3, float64)
183 #undef GEN_INPUT_FLUSH3
184 
185 /*
186  * Choose whether to use fpclassify or float32/64_* primitives in the generated
187  * hardfloat functions. Each combination of number of inputs and float size
188  * gets its own value.
189  */
190 #if defined(__x86_64__)
191 # define QEMU_HARDFLOAT_1F32_USE_FP 0
192 # define QEMU_HARDFLOAT_1F64_USE_FP 1
193 # define QEMU_HARDFLOAT_2F32_USE_FP 0
194 # define QEMU_HARDFLOAT_2F64_USE_FP 1
195 # define QEMU_HARDFLOAT_3F32_USE_FP 0
196 # define QEMU_HARDFLOAT_3F64_USE_FP 1
197 #else
198 # define QEMU_HARDFLOAT_1F32_USE_FP 0
199 # define QEMU_HARDFLOAT_1F64_USE_FP 0
200 # define QEMU_HARDFLOAT_2F32_USE_FP 0
201 # define QEMU_HARDFLOAT_2F64_USE_FP 0
202 # define QEMU_HARDFLOAT_3F32_USE_FP 0
203 # define QEMU_HARDFLOAT_3F64_USE_FP 0
204 #endif
205 
206 /*
207  * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over
208  * float{32,64}_is_infinity when !USE_FP.
209  * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup.
210  * On power64 however, using isinf() reduces fp-bench performance by up to 50%.
211  */
212 #if defined(__x86_64__) || defined(__aarch64__)
213 # define QEMU_HARDFLOAT_USE_ISINF   1
214 #else
215 # define QEMU_HARDFLOAT_USE_ISINF   0
216 #endif
217 
218 /*
219  * Some targets clear the FP flags before most FP operations. This prevents
220  * the use of hardfloat, since hardfloat relies on the inexact flag being
221  * already set.
222  */
223 #if defined(TARGET_PPC) || defined(__FAST_MATH__)
224 # if defined(__FAST_MATH__)
225 #  warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \
226     IEEE implementation
227 # endif
228 # define QEMU_NO_HARDFLOAT 1
229 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN
230 #else
231 # define QEMU_NO_HARDFLOAT 0
232 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline))
233 #endif
234 
235 static inline bool can_use_fpu(const float_status *s)
236 {
237     if (QEMU_NO_HARDFLOAT) {
238         return false;
239     }
240     return likely(s->float_exception_flags & float_flag_inexact &&
241                   s->float_rounding_mode == float_round_nearest_even);
242 }
243 
244 /*
245  * Hardfloat generation functions. Each operation can have two flavors:
246  * either using softfloat primitives (e.g. float32_is_zero_or_normal) for
247  * most condition checks, or native ones (e.g. fpclassify).
248  *
249  * The flavor is chosen by the callers. Instead of using macros, we rely on the
250  * compiler to propagate constants and inline everything into the callers.
251  *
252  * We only generate functions for operations with two inputs, since only
253  * these are common enough to justify consolidating them into common code.
254  */
255 
256 typedef union {
257     float32 s;
258     float h;
259 } union_float32;
260 
261 typedef union {
262     float64 s;
263     double h;
264 } union_float64;
265 
266 typedef bool (*f32_check_fn)(union_float32 a, union_float32 b);
267 typedef bool (*f64_check_fn)(union_float64 a, union_float64 b);
268 
269 typedef float32 (*soft_f32_op2_fn)(float32 a, float32 b, float_status *s);
270 typedef float64 (*soft_f64_op2_fn)(float64 a, float64 b, float_status *s);
271 typedef float   (*hard_f32_op2_fn)(float a, float b);
272 typedef double  (*hard_f64_op2_fn)(double a, double b);
273 
274 /* 2-input is-zero-or-normal */
275 static inline bool f32_is_zon2(union_float32 a, union_float32 b)
276 {
277     if (QEMU_HARDFLOAT_2F32_USE_FP) {
278         /*
279          * Not using a temp variable for consecutive fpclassify calls ends up
280          * generating faster code.
281          */
282         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
283                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
284     }
285     return float32_is_zero_or_normal(a.s) &&
286            float32_is_zero_or_normal(b.s);
287 }
288 
289 static inline bool f64_is_zon2(union_float64 a, union_float64 b)
290 {
291     if (QEMU_HARDFLOAT_2F64_USE_FP) {
292         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
293                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
294     }
295     return float64_is_zero_or_normal(a.s) &&
296            float64_is_zero_or_normal(b.s);
297 }
298 
299 /* 3-input is-zero-or-normal */
300 static inline
301 bool f32_is_zon3(union_float32 a, union_float32 b, union_float32 c)
302 {
303     if (QEMU_HARDFLOAT_3F32_USE_FP) {
304         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
305                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
306                (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
307     }
308     return float32_is_zero_or_normal(a.s) &&
309            float32_is_zero_or_normal(b.s) &&
310            float32_is_zero_or_normal(c.s);
311 }
312 
313 static inline
314 bool f64_is_zon3(union_float64 a, union_float64 b, union_float64 c)
315 {
316     if (QEMU_HARDFLOAT_3F64_USE_FP) {
317         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
318                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
319                (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
320     }
321     return float64_is_zero_or_normal(a.s) &&
322            float64_is_zero_or_normal(b.s) &&
323            float64_is_zero_or_normal(c.s);
324 }
325 
326 static inline bool f32_is_inf(union_float32 a)
327 {
328     if (QEMU_HARDFLOAT_USE_ISINF) {
329         return isinf(a.h);
330     }
331     return float32_is_infinity(a.s);
332 }
333 
334 static inline bool f64_is_inf(union_float64 a)
335 {
336     if (QEMU_HARDFLOAT_USE_ISINF) {
337         return isinf(a.h);
338     }
339     return float64_is_infinity(a.s);
340 }
341 
342 static inline float32
343 float32_gen2(float32 xa, float32 xb, float_status *s,
344              hard_f32_op2_fn hard, soft_f32_op2_fn soft,
345              f32_check_fn pre, f32_check_fn post)
346 {
347     union_float32 ua, ub, ur;
348 
349     ua.s = xa;
350     ub.s = xb;
351 
352     if (unlikely(!can_use_fpu(s))) {
353         goto soft;
354     }
355 
356     float32_input_flush2(&ua.s, &ub.s, s);
357     if (unlikely(!pre(ua, ub))) {
358         goto soft;
359     }
360 
361     ur.h = hard(ua.h, ub.h);
362     if (unlikely(f32_is_inf(ur))) {
363         s->float_exception_flags |= float_flag_overflow;
364     } else if (unlikely(fabsf(ur.h) <= FLT_MIN) && post(ua, ub)) {
365         goto soft;
366     }
367     return ur.s;
368 
369  soft:
370     return soft(ua.s, ub.s, s);
371 }
372 
373 static inline float64
374 float64_gen2(float64 xa, float64 xb, float_status *s,
375              hard_f64_op2_fn hard, soft_f64_op2_fn soft,
376              f64_check_fn pre, f64_check_fn post)
377 {
378     union_float64 ua, ub, ur;
379 
380     ua.s = xa;
381     ub.s = xb;
382 
383     if (unlikely(!can_use_fpu(s))) {
384         goto soft;
385     }
386 
387     float64_input_flush2(&ua.s, &ub.s, s);
388     if (unlikely(!pre(ua, ub))) {
389         goto soft;
390     }
391 
392     ur.h = hard(ua.h, ub.h);
393     if (unlikely(f64_is_inf(ur))) {
394         s->float_exception_flags |= float_flag_overflow;
395     } else if (unlikely(fabs(ur.h) <= DBL_MIN) && post(ua, ub)) {
396         goto soft;
397     }
398     return ur.s;
399 
400  soft:
401     return soft(ua.s, ub.s, s);
402 }
403 
404 /*----------------------------------------------------------------------------
405 | Returns the fraction bits of the single-precision floating-point value `a'.
406 *----------------------------------------------------------------------------*/
407 
408 static inline uint32_t extractFloat32Frac(float32 a)
409 {
410     return float32_val(a) & 0x007FFFFF;
411 }
412 
413 /*----------------------------------------------------------------------------
414 | Returns the exponent bits of the single-precision floating-point value `a'.
415 *----------------------------------------------------------------------------*/
416 
417 static inline int extractFloat32Exp(float32 a)
418 {
419     return (float32_val(a) >> 23) & 0xFF;
420 }
421 
422 /*----------------------------------------------------------------------------
423 | Returns the sign bit of the single-precision floating-point value `a'.
424 *----------------------------------------------------------------------------*/
425 
426 static inline bool extractFloat32Sign(float32 a)
427 {
428     return float32_val(a) >> 31;
429 }
430 
431 /*----------------------------------------------------------------------------
432 | Returns the fraction bits of the double-precision floating-point value `a'.
433 *----------------------------------------------------------------------------*/
434 
435 static inline uint64_t extractFloat64Frac(float64 a)
436 {
437     return float64_val(a) & UINT64_C(0x000FFFFFFFFFFFFF);
438 }
439 
440 /*----------------------------------------------------------------------------
441 | Returns the exponent bits of the double-precision floating-point value `a'.
442 *----------------------------------------------------------------------------*/
443 
444 static inline int extractFloat64Exp(float64 a)
445 {
446     return (float64_val(a) >> 52) & 0x7FF;
447 }
448 
449 /*----------------------------------------------------------------------------
450 | Returns the sign bit of the double-precision floating-point value `a'.
451 *----------------------------------------------------------------------------*/
452 
453 static inline bool extractFloat64Sign(float64 a)
454 {
455     return float64_val(a) >> 63;
456 }
457 
458 /*
459  * Classify a floating point number. Everything above float_class_qnan
460  * is a NaN so cls >= float_class_qnan is any NaN.
461  */
462 
463 typedef enum __attribute__ ((__packed__)) {
464     float_class_unclassified,
465     float_class_zero,
466     float_class_normal,
467     float_class_inf,
468     float_class_qnan,  /* all NaNs from here */
469     float_class_snan,
470 } FloatClass;
471 
472 /* Simple helpers for checking if, or what kind of, NaN we have */
473 static inline __attribute__((unused)) bool is_nan(FloatClass c)
474 {
475     return unlikely(c >= float_class_qnan);
476 }
477 
478 static inline __attribute__((unused)) bool is_snan(FloatClass c)
479 {
480     return c == float_class_snan;
481 }
482 
483 static inline __attribute__((unused)) bool is_qnan(FloatClass c)
484 {
485     return c == float_class_qnan;
486 }
487 
488 /*
489  * Structure holding all of the decomposed parts of a float. The
490  * exponent is unbiased and the fraction is normalized. All
491  * calculations are done with a 64 bit fraction and then rounded as
492  * appropriate for the final format.
493  *
494  * Thanks to the packed FloatClass a decent compiler should be able to
495  * fit the whole structure into registers and avoid using the stack
496  * for parameter passing.
497  */
498 
499 typedef struct {
500     uint64_t frac;
501     int32_t  exp;
502     FloatClass cls;
503     bool sign;
504 } FloatParts;
505 
506 #define DECOMPOSED_BINARY_POINT    (64 - 2)
507 #define DECOMPOSED_IMPLICIT_BIT    (1ull << DECOMPOSED_BINARY_POINT)
508 #define DECOMPOSED_OVERFLOW_BIT    (DECOMPOSED_IMPLICIT_BIT << 1)
509 
510 /* Structure holding all of the relevant parameters for a format.
511  *   exp_size: the size of the exponent field
512  *   exp_bias: the offset applied to the exponent field
513  *   exp_max: the maximum normalised exponent
514  *   frac_size: the size of the fraction field
515  *   frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
516  * The following are computed based the size of fraction
517  *   frac_lsb: least significant bit of fraction
518  *   frac_lsbm1: the bit below the least significant bit (for rounding)
519  *   round_mask/roundeven_mask: masks used for rounding
520  * The following optional modifiers are available:
521  *   arm_althp: handle ARM Alternative Half Precision
522  */
523 typedef struct {
524     int exp_size;
525     int exp_bias;
526     int exp_max;
527     int frac_size;
528     int frac_shift;
529     uint64_t frac_lsb;
530     uint64_t frac_lsbm1;
531     uint64_t round_mask;
532     uint64_t roundeven_mask;
533     bool arm_althp;
534 } FloatFmt;
535 
536 /* Expand fields based on the size of exponent and fraction */
537 #define FLOAT_PARAMS(E, F)                                           \
538     .exp_size       = E,                                             \
539     .exp_bias       = ((1 << E) - 1) >> 1,                           \
540     .exp_max        = (1 << E) - 1,                                  \
541     .frac_size      = F,                                             \
542     .frac_shift     = DECOMPOSED_BINARY_POINT - F,                   \
543     .frac_lsb       = 1ull << (DECOMPOSED_BINARY_POINT - F),         \
544     .frac_lsbm1     = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1),   \
545     .round_mask     = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1,   \
546     .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1
547 
548 static const FloatFmt float16_params = {
549     FLOAT_PARAMS(5, 10)
550 };
551 
552 static const FloatFmt float16_params_ahp = {
553     FLOAT_PARAMS(5, 10),
554     .arm_althp = true
555 };
556 
557 static const FloatFmt float32_params = {
558     FLOAT_PARAMS(8, 23)
559 };
560 
561 static const FloatFmt float64_params = {
562     FLOAT_PARAMS(11, 52)
563 };
564 
565 /* Unpack a float to parts, but do not canonicalize.  */
566 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw)
567 {
568     const int sign_pos = fmt.frac_size + fmt.exp_size;
569 
570     return (FloatParts) {
571         .cls = float_class_unclassified,
572         .sign = extract64(raw, sign_pos, 1),
573         .exp = extract64(raw, fmt.frac_size, fmt.exp_size),
574         .frac = extract64(raw, 0, fmt.frac_size),
575     };
576 }
577 
578 static inline FloatParts float16_unpack_raw(float16 f)
579 {
580     return unpack_raw(float16_params, f);
581 }
582 
583 static inline FloatParts float32_unpack_raw(float32 f)
584 {
585     return unpack_raw(float32_params, f);
586 }
587 
588 static inline FloatParts float64_unpack_raw(float64 f)
589 {
590     return unpack_raw(float64_params, f);
591 }
592 
593 /* Pack a float from parts, but do not canonicalize.  */
594 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p)
595 {
596     const int sign_pos = fmt.frac_size + fmt.exp_size;
597     uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp);
598     return deposit64(ret, sign_pos, 1, p.sign);
599 }
600 
601 static inline float16 float16_pack_raw(FloatParts p)
602 {
603     return make_float16(pack_raw(float16_params, p));
604 }
605 
606 static inline float32 float32_pack_raw(FloatParts p)
607 {
608     return make_float32(pack_raw(float32_params, p));
609 }
610 
611 static inline float64 float64_pack_raw(FloatParts p)
612 {
613     return make_float64(pack_raw(float64_params, p));
614 }
615 
616 /*----------------------------------------------------------------------------
617 | Functions and definitions to determine:  (1) whether tininess for underflow
618 | is detected before or after rounding by default, (2) what (if anything)
619 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
620 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
621 | are propagated from function inputs to output.  These details are target-
622 | specific.
623 *----------------------------------------------------------------------------*/
624 #include "softfloat-specialize.inc.c"
625 
626 /* Canonicalize EXP and FRAC, setting CLS.  */
627 static FloatParts sf_canonicalize(FloatParts part, const FloatFmt *parm,
628                                   float_status *status)
629 {
630     if (part.exp == parm->exp_max && !parm->arm_althp) {
631         if (part.frac == 0) {
632             part.cls = float_class_inf;
633         } else {
634             part.frac <<= parm->frac_shift;
635             part.cls = (parts_is_snan_frac(part.frac, status)
636                         ? float_class_snan : float_class_qnan);
637         }
638     } else if (part.exp == 0) {
639         if (likely(part.frac == 0)) {
640             part.cls = float_class_zero;
641         } else if (status->flush_inputs_to_zero) {
642             float_raise(float_flag_input_denormal, status);
643             part.cls = float_class_zero;
644             part.frac = 0;
645         } else {
646             int shift = clz64(part.frac) - 1;
647             part.cls = float_class_normal;
648             part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
649             part.frac <<= shift;
650         }
651     } else {
652         part.cls = float_class_normal;
653         part.exp -= parm->exp_bias;
654         part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
655     }
656     return part;
657 }
658 
659 /* Round and uncanonicalize a floating-point number by parts. There
660  * are FRAC_SHIFT bits that may require rounding at the bottom of the
661  * fraction; these bits will be removed. The exponent will be biased
662  * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
663  */
664 
665 static FloatParts round_canonical(FloatParts p, float_status *s,
666                                   const FloatFmt *parm)
667 {
668     const uint64_t frac_lsb = parm->frac_lsb;
669     const uint64_t frac_lsbm1 = parm->frac_lsbm1;
670     const uint64_t round_mask = parm->round_mask;
671     const uint64_t roundeven_mask = parm->roundeven_mask;
672     const int exp_max = parm->exp_max;
673     const int frac_shift = parm->frac_shift;
674     uint64_t frac, inc;
675     int exp, flags = 0;
676     bool overflow_norm;
677 
678     frac = p.frac;
679     exp = p.exp;
680 
681     switch (p.cls) {
682     case float_class_normal:
683         switch (s->float_rounding_mode) {
684         case float_round_nearest_even:
685             overflow_norm = false;
686             inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
687             break;
688         case float_round_ties_away:
689             overflow_norm = false;
690             inc = frac_lsbm1;
691             break;
692         case float_round_to_zero:
693             overflow_norm = true;
694             inc = 0;
695             break;
696         case float_round_up:
697             inc = p.sign ? 0 : round_mask;
698             overflow_norm = p.sign;
699             break;
700         case float_round_down:
701             inc = p.sign ? round_mask : 0;
702             overflow_norm = !p.sign;
703             break;
704         case float_round_to_odd:
705             overflow_norm = true;
706             inc = frac & frac_lsb ? 0 : round_mask;
707             break;
708         default:
709             g_assert_not_reached();
710         }
711 
712         exp += parm->exp_bias;
713         if (likely(exp > 0)) {
714             if (frac & round_mask) {
715                 flags |= float_flag_inexact;
716                 frac += inc;
717                 if (frac & DECOMPOSED_OVERFLOW_BIT) {
718                     frac >>= 1;
719                     exp++;
720                 }
721             }
722             frac >>= frac_shift;
723 
724             if (parm->arm_althp) {
725                 /* ARM Alt HP eschews Inf and NaN for a wider exponent.  */
726                 if (unlikely(exp > exp_max)) {
727                     /* Overflow.  Return the maximum normal.  */
728                     flags = float_flag_invalid;
729                     exp = exp_max;
730                     frac = -1;
731                 }
732             } else if (unlikely(exp >= exp_max)) {
733                 flags |= float_flag_overflow | float_flag_inexact;
734                 if (overflow_norm) {
735                     exp = exp_max - 1;
736                     frac = -1;
737                 } else {
738                     p.cls = float_class_inf;
739                     goto do_inf;
740                 }
741             }
742         } else if (s->flush_to_zero) {
743             flags |= float_flag_output_denormal;
744             p.cls = float_class_zero;
745             goto do_zero;
746         } else {
747             bool is_tiny = s->tininess_before_rounding
748                         || (exp < 0)
749                         || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT);
750 
751             shift64RightJamming(frac, 1 - exp, &frac);
752             if (frac & round_mask) {
753                 /* Need to recompute round-to-even.  */
754                 switch (s->float_rounding_mode) {
755                 case float_round_nearest_even:
756                     inc = ((frac & roundeven_mask) != frac_lsbm1
757                            ? frac_lsbm1 : 0);
758                     break;
759                 case float_round_to_odd:
760                     inc = frac & frac_lsb ? 0 : round_mask;
761                     break;
762                 default:
763                     break;
764                 }
765                 flags |= float_flag_inexact;
766                 frac += inc;
767             }
768 
769             exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
770             frac >>= frac_shift;
771 
772             if (is_tiny && (flags & float_flag_inexact)) {
773                 flags |= float_flag_underflow;
774             }
775             if (exp == 0 && frac == 0) {
776                 p.cls = float_class_zero;
777             }
778         }
779         break;
780 
781     case float_class_zero:
782     do_zero:
783         exp = 0;
784         frac = 0;
785         break;
786 
787     case float_class_inf:
788     do_inf:
789         assert(!parm->arm_althp);
790         exp = exp_max;
791         frac = 0;
792         break;
793 
794     case float_class_qnan:
795     case float_class_snan:
796         assert(!parm->arm_althp);
797         exp = exp_max;
798         frac >>= parm->frac_shift;
799         break;
800 
801     default:
802         g_assert_not_reached();
803     }
804 
805     float_raise(flags, s);
806     p.exp = exp;
807     p.frac = frac;
808     return p;
809 }
810 
811 /* Explicit FloatFmt version */
812 static FloatParts float16a_unpack_canonical(float16 f, float_status *s,
813                                             const FloatFmt *params)
814 {
815     return sf_canonicalize(float16_unpack_raw(f), params, s);
816 }
817 
818 static FloatParts float16_unpack_canonical(float16 f, float_status *s)
819 {
820     return float16a_unpack_canonical(f, s, &float16_params);
821 }
822 
823 static float16 float16a_round_pack_canonical(FloatParts p, float_status *s,
824                                              const FloatFmt *params)
825 {
826     return float16_pack_raw(round_canonical(p, s, params));
827 }
828 
829 static float16 float16_round_pack_canonical(FloatParts p, float_status *s)
830 {
831     return float16a_round_pack_canonical(p, s, &float16_params);
832 }
833 
834 static FloatParts float32_unpack_canonical(float32 f, float_status *s)
835 {
836     return sf_canonicalize(float32_unpack_raw(f), &float32_params, s);
837 }
838 
839 static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
840 {
841     return float32_pack_raw(round_canonical(p, s, &float32_params));
842 }
843 
844 static FloatParts float64_unpack_canonical(float64 f, float_status *s)
845 {
846     return sf_canonicalize(float64_unpack_raw(f), &float64_params, s);
847 }
848 
849 static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
850 {
851     return float64_pack_raw(round_canonical(p, s, &float64_params));
852 }
853 
854 static FloatParts return_nan(FloatParts a, float_status *s)
855 {
856     switch (a.cls) {
857     case float_class_snan:
858         s->float_exception_flags |= float_flag_invalid;
859         a = parts_silence_nan(a, s);
860         /* fall through */
861     case float_class_qnan:
862         if (s->default_nan_mode) {
863             return parts_default_nan(s);
864         }
865         break;
866 
867     default:
868         g_assert_not_reached();
869     }
870     return a;
871 }
872 
873 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s)
874 {
875     if (is_snan(a.cls) || is_snan(b.cls)) {
876         s->float_exception_flags |= float_flag_invalid;
877     }
878 
879     if (s->default_nan_mode) {
880         return parts_default_nan(s);
881     } else {
882         if (pickNaN(a.cls, b.cls,
883                     a.frac > b.frac ||
884                     (a.frac == b.frac && a.sign < b.sign))) {
885             a = b;
886         }
887         if (is_snan(a.cls)) {
888             return parts_silence_nan(a, s);
889         }
890     }
891     return a;
892 }
893 
894 static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c,
895                                   bool inf_zero, float_status *s)
896 {
897     int which;
898 
899     if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
900         s->float_exception_flags |= float_flag_invalid;
901     }
902 
903     which = pickNaNMulAdd(a.cls, b.cls, c.cls, inf_zero, s);
904 
905     if (s->default_nan_mode) {
906         /* Note that this check is after pickNaNMulAdd so that function
907          * has an opportunity to set the Invalid flag.
908          */
909         which = 3;
910     }
911 
912     switch (which) {
913     case 0:
914         break;
915     case 1:
916         a = b;
917         break;
918     case 2:
919         a = c;
920         break;
921     case 3:
922         return parts_default_nan(s);
923     default:
924         g_assert_not_reached();
925     }
926 
927     if (is_snan(a.cls)) {
928         return parts_silence_nan(a, s);
929     }
930     return a;
931 }
932 
933 /*
934  * Returns the result of adding or subtracting the values of the
935  * floating-point values `a' and `b'. The operation is performed
936  * according to the IEC/IEEE Standard for Binary Floating-Point
937  * Arithmetic.
938  */
939 
940 static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
941                                 float_status *s)
942 {
943     bool a_sign = a.sign;
944     bool b_sign = b.sign ^ subtract;
945 
946     if (a_sign != b_sign) {
947         /* Subtraction */
948 
949         if (a.cls == float_class_normal && b.cls == float_class_normal) {
950             if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
951                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
952                 a.frac = a.frac - b.frac;
953             } else {
954                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
955                 a.frac = b.frac - a.frac;
956                 a.exp = b.exp;
957                 a_sign ^= 1;
958             }
959 
960             if (a.frac == 0) {
961                 a.cls = float_class_zero;
962                 a.sign = s->float_rounding_mode == float_round_down;
963             } else {
964                 int shift = clz64(a.frac) - 1;
965                 a.frac = a.frac << shift;
966                 a.exp = a.exp - shift;
967                 a.sign = a_sign;
968             }
969             return a;
970         }
971         if (is_nan(a.cls) || is_nan(b.cls)) {
972             return pick_nan(a, b, s);
973         }
974         if (a.cls == float_class_inf) {
975             if (b.cls == float_class_inf) {
976                 float_raise(float_flag_invalid, s);
977                 return parts_default_nan(s);
978             }
979             return a;
980         }
981         if (a.cls == float_class_zero && b.cls == float_class_zero) {
982             a.sign = s->float_rounding_mode == float_round_down;
983             return a;
984         }
985         if (a.cls == float_class_zero || b.cls == float_class_inf) {
986             b.sign = a_sign ^ 1;
987             return b;
988         }
989         if (b.cls == float_class_zero) {
990             return a;
991         }
992     } else {
993         /* Addition */
994         if (a.cls == float_class_normal && b.cls == float_class_normal) {
995             if (a.exp > b.exp) {
996                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
997             } else if (a.exp < b.exp) {
998                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
999                 a.exp = b.exp;
1000             }
1001             a.frac += b.frac;
1002             if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
1003                 shift64RightJamming(a.frac, 1, &a.frac);
1004                 a.exp += 1;
1005             }
1006             return a;
1007         }
1008         if (is_nan(a.cls) || is_nan(b.cls)) {
1009             return pick_nan(a, b, s);
1010         }
1011         if (a.cls == float_class_inf || b.cls == float_class_zero) {
1012             return a;
1013         }
1014         if (b.cls == float_class_inf || a.cls == float_class_zero) {
1015             b.sign = b_sign;
1016             return b;
1017         }
1018     }
1019     g_assert_not_reached();
1020 }
1021 
1022 /*
1023  * Returns the result of adding or subtracting the floating-point
1024  * values `a' and `b'. The operation is performed according to the
1025  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1026  */
1027 
1028 float16 QEMU_FLATTEN float16_add(float16 a, float16 b, float_status *status)
1029 {
1030     FloatParts pa = float16_unpack_canonical(a, status);
1031     FloatParts pb = float16_unpack_canonical(b, status);
1032     FloatParts pr = addsub_floats(pa, pb, false, status);
1033 
1034     return float16_round_pack_canonical(pr, status);
1035 }
1036 
1037 float16 QEMU_FLATTEN float16_sub(float16 a, float16 b, float_status *status)
1038 {
1039     FloatParts pa = float16_unpack_canonical(a, status);
1040     FloatParts pb = float16_unpack_canonical(b, status);
1041     FloatParts pr = addsub_floats(pa, pb, true, status);
1042 
1043     return float16_round_pack_canonical(pr, status);
1044 }
1045 
1046 static float32 QEMU_SOFTFLOAT_ATTR
1047 soft_f32_addsub(float32 a, float32 b, bool subtract, float_status *status)
1048 {
1049     FloatParts pa = float32_unpack_canonical(a, status);
1050     FloatParts pb = float32_unpack_canonical(b, status);
1051     FloatParts pr = addsub_floats(pa, pb, subtract, status);
1052 
1053     return float32_round_pack_canonical(pr, status);
1054 }
1055 
1056 static inline float32 soft_f32_add(float32 a, float32 b, float_status *status)
1057 {
1058     return soft_f32_addsub(a, b, false, status);
1059 }
1060 
1061 static inline float32 soft_f32_sub(float32 a, float32 b, float_status *status)
1062 {
1063     return soft_f32_addsub(a, b, true, status);
1064 }
1065 
1066 static float64 QEMU_SOFTFLOAT_ATTR
1067 soft_f64_addsub(float64 a, float64 b, bool subtract, float_status *status)
1068 {
1069     FloatParts pa = float64_unpack_canonical(a, status);
1070     FloatParts pb = float64_unpack_canonical(b, status);
1071     FloatParts pr = addsub_floats(pa, pb, subtract, status);
1072 
1073     return float64_round_pack_canonical(pr, status);
1074 }
1075 
1076 static inline float64 soft_f64_add(float64 a, float64 b, float_status *status)
1077 {
1078     return soft_f64_addsub(a, b, false, status);
1079 }
1080 
1081 static inline float64 soft_f64_sub(float64 a, float64 b, float_status *status)
1082 {
1083     return soft_f64_addsub(a, b, true, status);
1084 }
1085 
1086 static float hard_f32_add(float a, float b)
1087 {
1088     return a + b;
1089 }
1090 
1091 static float hard_f32_sub(float a, float b)
1092 {
1093     return a - b;
1094 }
1095 
1096 static double hard_f64_add(double a, double b)
1097 {
1098     return a + b;
1099 }
1100 
1101 static double hard_f64_sub(double a, double b)
1102 {
1103     return a - b;
1104 }
1105 
1106 static bool f32_addsubmul_post(union_float32 a, union_float32 b)
1107 {
1108     if (QEMU_HARDFLOAT_2F32_USE_FP) {
1109         return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
1110     }
1111     return !(float32_is_zero(a.s) && float32_is_zero(b.s));
1112 }
1113 
1114 static bool f64_addsubmul_post(union_float64 a, union_float64 b)
1115 {
1116     if (QEMU_HARDFLOAT_2F64_USE_FP) {
1117         return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
1118     } else {
1119         return !(float64_is_zero(a.s) && float64_is_zero(b.s));
1120     }
1121 }
1122 
1123 static float32 float32_addsub(float32 a, float32 b, float_status *s,
1124                               hard_f32_op2_fn hard, soft_f32_op2_fn soft)
1125 {
1126     return float32_gen2(a, b, s, hard, soft,
1127                         f32_is_zon2, f32_addsubmul_post);
1128 }
1129 
1130 static float64 float64_addsub(float64 a, float64 b, float_status *s,
1131                               hard_f64_op2_fn hard, soft_f64_op2_fn soft)
1132 {
1133     return float64_gen2(a, b, s, hard, soft,
1134                         f64_is_zon2, f64_addsubmul_post);
1135 }
1136 
1137 float32 QEMU_FLATTEN
1138 float32_add(float32 a, float32 b, float_status *s)
1139 {
1140     return float32_addsub(a, b, s, hard_f32_add, soft_f32_add);
1141 }
1142 
1143 float32 QEMU_FLATTEN
1144 float32_sub(float32 a, float32 b, float_status *s)
1145 {
1146     return float32_addsub(a, b, s, hard_f32_sub, soft_f32_sub);
1147 }
1148 
1149 float64 QEMU_FLATTEN
1150 float64_add(float64 a, float64 b, float_status *s)
1151 {
1152     return float64_addsub(a, b, s, hard_f64_add, soft_f64_add);
1153 }
1154 
1155 float64 QEMU_FLATTEN
1156 float64_sub(float64 a, float64 b, float_status *s)
1157 {
1158     return float64_addsub(a, b, s, hard_f64_sub, soft_f64_sub);
1159 }
1160 
1161 /*
1162  * Returns the result of multiplying the floating-point values `a' and
1163  * `b'. The operation is performed according to the IEC/IEEE Standard
1164  * for Binary Floating-Point Arithmetic.
1165  */
1166 
1167 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s)
1168 {
1169     bool sign = a.sign ^ b.sign;
1170 
1171     if (a.cls == float_class_normal && b.cls == float_class_normal) {
1172         uint64_t hi, lo;
1173         int exp = a.exp + b.exp;
1174 
1175         mul64To128(a.frac, b.frac, &hi, &lo);
1176         shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1177         if (lo & DECOMPOSED_OVERFLOW_BIT) {
1178             shift64RightJamming(lo, 1, &lo);
1179             exp += 1;
1180         }
1181 
1182         /* Re-use a */
1183         a.exp = exp;
1184         a.sign = sign;
1185         a.frac = lo;
1186         return a;
1187     }
1188     /* handle all the NaN cases */
1189     if (is_nan(a.cls) || is_nan(b.cls)) {
1190         return pick_nan(a, b, s);
1191     }
1192     /* Inf * Zero == NaN */
1193     if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
1194         (a.cls == float_class_zero && b.cls == float_class_inf)) {
1195         s->float_exception_flags |= float_flag_invalid;
1196         return parts_default_nan(s);
1197     }
1198     /* Multiply by 0 or Inf */
1199     if (a.cls == float_class_inf || a.cls == float_class_zero) {
1200         a.sign = sign;
1201         return a;
1202     }
1203     if (b.cls == float_class_inf || b.cls == float_class_zero) {
1204         b.sign = sign;
1205         return b;
1206     }
1207     g_assert_not_reached();
1208 }
1209 
1210 float16 QEMU_FLATTEN float16_mul(float16 a, float16 b, float_status *status)
1211 {
1212     FloatParts pa = float16_unpack_canonical(a, status);
1213     FloatParts pb = float16_unpack_canonical(b, status);
1214     FloatParts pr = mul_floats(pa, pb, status);
1215 
1216     return float16_round_pack_canonical(pr, status);
1217 }
1218 
1219 static float32 QEMU_SOFTFLOAT_ATTR
1220 soft_f32_mul(float32 a, float32 b, float_status *status)
1221 {
1222     FloatParts pa = float32_unpack_canonical(a, status);
1223     FloatParts pb = float32_unpack_canonical(b, status);
1224     FloatParts pr = mul_floats(pa, pb, status);
1225 
1226     return float32_round_pack_canonical(pr, status);
1227 }
1228 
1229 static float64 QEMU_SOFTFLOAT_ATTR
1230 soft_f64_mul(float64 a, float64 b, float_status *status)
1231 {
1232     FloatParts pa = float64_unpack_canonical(a, status);
1233     FloatParts pb = float64_unpack_canonical(b, status);
1234     FloatParts pr = mul_floats(pa, pb, status);
1235 
1236     return float64_round_pack_canonical(pr, status);
1237 }
1238 
1239 static float hard_f32_mul(float a, float b)
1240 {
1241     return a * b;
1242 }
1243 
1244 static double hard_f64_mul(double a, double b)
1245 {
1246     return a * b;
1247 }
1248 
1249 float32 QEMU_FLATTEN
1250 float32_mul(float32 a, float32 b, float_status *s)
1251 {
1252     return float32_gen2(a, b, s, hard_f32_mul, soft_f32_mul,
1253                         f32_is_zon2, f32_addsubmul_post);
1254 }
1255 
1256 float64 QEMU_FLATTEN
1257 float64_mul(float64 a, float64 b, float_status *s)
1258 {
1259     return float64_gen2(a, b, s, hard_f64_mul, soft_f64_mul,
1260                         f64_is_zon2, f64_addsubmul_post);
1261 }
1262 
1263 /*
1264  * Returns the result of multiplying the floating-point values `a' and
1265  * `b' then adding 'c', with no intermediate rounding step after the
1266  * multiplication. The operation is performed according to the
1267  * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
1268  * The flags argument allows the caller to select negation of the
1269  * addend, the intermediate product, or the final result. (The
1270  * difference between this and having the caller do a separate
1271  * negation is that negating externally will flip the sign bit on
1272  * NaNs.)
1273  */
1274 
1275 static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c,
1276                                 int flags, float_status *s)
1277 {
1278     bool inf_zero = ((1 << a.cls) | (1 << b.cls)) ==
1279                     ((1 << float_class_inf) | (1 << float_class_zero));
1280     bool p_sign;
1281     bool sign_flip = flags & float_muladd_negate_result;
1282     FloatClass p_class;
1283     uint64_t hi, lo;
1284     int p_exp;
1285 
1286     /* It is implementation-defined whether the cases of (0,inf,qnan)
1287      * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
1288      * they return if they do), so we have to hand this information
1289      * off to the target-specific pick-a-NaN routine.
1290      */
1291     if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) {
1292         return pick_nan_muladd(a, b, c, inf_zero, s);
1293     }
1294 
1295     if (inf_zero) {
1296         s->float_exception_flags |= float_flag_invalid;
1297         return parts_default_nan(s);
1298     }
1299 
1300     if (flags & float_muladd_negate_c) {
1301         c.sign ^= 1;
1302     }
1303 
1304     p_sign = a.sign ^ b.sign;
1305 
1306     if (flags & float_muladd_negate_product) {
1307         p_sign ^= 1;
1308     }
1309 
1310     if (a.cls == float_class_inf || b.cls == float_class_inf) {
1311         p_class = float_class_inf;
1312     } else if (a.cls == float_class_zero || b.cls == float_class_zero) {
1313         p_class = float_class_zero;
1314     } else {
1315         p_class = float_class_normal;
1316     }
1317 
1318     if (c.cls == float_class_inf) {
1319         if (p_class == float_class_inf && p_sign != c.sign) {
1320             s->float_exception_flags |= float_flag_invalid;
1321             return parts_default_nan(s);
1322         } else {
1323             a.cls = float_class_inf;
1324             a.sign = c.sign ^ sign_flip;
1325             return a;
1326         }
1327     }
1328 
1329     if (p_class == float_class_inf) {
1330         a.cls = float_class_inf;
1331         a.sign = p_sign ^ sign_flip;
1332         return a;
1333     }
1334 
1335     if (p_class == float_class_zero) {
1336         if (c.cls == float_class_zero) {
1337             if (p_sign != c.sign) {
1338                 p_sign = s->float_rounding_mode == float_round_down;
1339             }
1340             c.sign = p_sign;
1341         } else if (flags & float_muladd_halve_result) {
1342             c.exp -= 1;
1343         }
1344         c.sign ^= sign_flip;
1345         return c;
1346     }
1347 
1348     /* a & b should be normals now... */
1349     assert(a.cls == float_class_normal &&
1350            b.cls == float_class_normal);
1351 
1352     p_exp = a.exp + b.exp;
1353 
1354     /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
1355      * result.
1356      */
1357     mul64To128(a.frac, b.frac, &hi, &lo);
1358     /* binary point now at bit 124 */
1359 
1360     /* check for overflow */
1361     if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) {
1362         shift128RightJamming(hi, lo, 1, &hi, &lo);
1363         p_exp += 1;
1364     }
1365 
1366     /* + add/sub */
1367     if (c.cls == float_class_zero) {
1368         /* move binary point back to 62 */
1369         shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1370     } else {
1371         int exp_diff = p_exp - c.exp;
1372         if (p_sign == c.sign) {
1373             /* Addition */
1374             if (exp_diff <= 0) {
1375                 shift128RightJamming(hi, lo,
1376                                      DECOMPOSED_BINARY_POINT - exp_diff,
1377                                      &hi, &lo);
1378                 lo += c.frac;
1379                 p_exp = c.exp;
1380             } else {
1381                 uint64_t c_hi, c_lo;
1382                 /* shift c to the same binary point as the product (124) */
1383                 c_hi = c.frac >> 2;
1384                 c_lo = 0;
1385                 shift128RightJamming(c_hi, c_lo,
1386                                      exp_diff,
1387                                      &c_hi, &c_lo);
1388                 add128(hi, lo, c_hi, c_lo, &hi, &lo);
1389                 /* move binary point back to 62 */
1390                 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1391             }
1392 
1393             if (lo & DECOMPOSED_OVERFLOW_BIT) {
1394                 shift64RightJamming(lo, 1, &lo);
1395                 p_exp += 1;
1396             }
1397 
1398         } else {
1399             /* Subtraction */
1400             uint64_t c_hi, c_lo;
1401             /* make C binary point match product at bit 124 */
1402             c_hi = c.frac >> 2;
1403             c_lo = 0;
1404 
1405             if (exp_diff <= 0) {
1406                 shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
1407                 if (exp_diff == 0
1408                     &&
1409                     (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
1410                     sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1411                 } else {
1412                     sub128(c_hi, c_lo, hi, lo, &hi, &lo);
1413                     p_sign ^= 1;
1414                     p_exp = c.exp;
1415                 }
1416             } else {
1417                 shift128RightJamming(c_hi, c_lo,
1418                                      exp_diff,
1419                                      &c_hi, &c_lo);
1420                 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1421             }
1422 
1423             if (hi == 0 && lo == 0) {
1424                 a.cls = float_class_zero;
1425                 a.sign = s->float_rounding_mode == float_round_down;
1426                 a.sign ^= sign_flip;
1427                 return a;
1428             } else {
1429                 int shift;
1430                 if (hi != 0) {
1431                     shift = clz64(hi);
1432                 } else {
1433                     shift = clz64(lo) + 64;
1434                 }
1435                 /* Normalizing to a binary point of 124 is the
1436                    correct adjust for the exponent.  However since we're
1437                    shifting, we might as well put the binary point back
1438                    at 62 where we really want it.  Therefore shift as
1439                    if we're leaving 1 bit at the top of the word, but
1440                    adjust the exponent as if we're leaving 3 bits.  */
1441                 shift -= 1;
1442                 if (shift >= 64) {
1443                     lo = lo << (shift - 64);
1444                 } else {
1445                     hi = (hi << shift) | (lo >> (64 - shift));
1446                     lo = hi | ((lo << shift) != 0);
1447                 }
1448                 p_exp -= shift - 2;
1449             }
1450         }
1451     }
1452 
1453     if (flags & float_muladd_halve_result) {
1454         p_exp -= 1;
1455     }
1456 
1457     /* finally prepare our result */
1458     a.cls = float_class_normal;
1459     a.sign = p_sign ^ sign_flip;
1460     a.exp = p_exp;
1461     a.frac = lo;
1462 
1463     return a;
1464 }
1465 
1466 float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c,
1467                                                 int flags, float_status *status)
1468 {
1469     FloatParts pa = float16_unpack_canonical(a, status);
1470     FloatParts pb = float16_unpack_canonical(b, status);
1471     FloatParts pc = float16_unpack_canonical(c, status);
1472     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1473 
1474     return float16_round_pack_canonical(pr, status);
1475 }
1476 
1477 static float32 QEMU_SOFTFLOAT_ATTR
1478 soft_f32_muladd(float32 a, float32 b, float32 c, int flags,
1479                 float_status *status)
1480 {
1481     FloatParts pa = float32_unpack_canonical(a, status);
1482     FloatParts pb = float32_unpack_canonical(b, status);
1483     FloatParts pc = float32_unpack_canonical(c, status);
1484     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1485 
1486     return float32_round_pack_canonical(pr, status);
1487 }
1488 
1489 static float64 QEMU_SOFTFLOAT_ATTR
1490 soft_f64_muladd(float64 a, float64 b, float64 c, int flags,
1491                 float_status *status)
1492 {
1493     FloatParts pa = float64_unpack_canonical(a, status);
1494     FloatParts pb = float64_unpack_canonical(b, status);
1495     FloatParts pc = float64_unpack_canonical(c, status);
1496     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1497 
1498     return float64_round_pack_canonical(pr, status);
1499 }
1500 
1501 static bool force_soft_fma;
1502 
1503 float32 QEMU_FLATTEN
1504 float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s)
1505 {
1506     union_float32 ua, ub, uc, ur;
1507 
1508     ua.s = xa;
1509     ub.s = xb;
1510     uc.s = xc;
1511 
1512     if (unlikely(!can_use_fpu(s))) {
1513         goto soft;
1514     }
1515     if (unlikely(flags & float_muladd_halve_result)) {
1516         goto soft;
1517     }
1518 
1519     float32_input_flush3(&ua.s, &ub.s, &uc.s, s);
1520     if (unlikely(!f32_is_zon3(ua, ub, uc))) {
1521         goto soft;
1522     }
1523 
1524     if (unlikely(force_soft_fma)) {
1525         goto soft;
1526     }
1527 
1528     /*
1529      * When (a || b) == 0, there's no need to check for under/over flow,
1530      * since we know the addend is (normal || 0) and the product is 0.
1531      */
1532     if (float32_is_zero(ua.s) || float32_is_zero(ub.s)) {
1533         union_float32 up;
1534         bool prod_sign;
1535 
1536         prod_sign = float32_is_neg(ua.s) ^ float32_is_neg(ub.s);
1537         prod_sign ^= !!(flags & float_muladd_negate_product);
1538         up.s = float32_set_sign(float32_zero, prod_sign);
1539 
1540         if (flags & float_muladd_negate_c) {
1541             uc.h = -uc.h;
1542         }
1543         ur.h = up.h + uc.h;
1544     } else {
1545         union_float32 ua_orig = ua;
1546         union_float32 uc_orig = uc;
1547 
1548         if (flags & float_muladd_negate_product) {
1549             ua.h = -ua.h;
1550         }
1551         if (flags & float_muladd_negate_c) {
1552             uc.h = -uc.h;
1553         }
1554 
1555         ur.h = fmaf(ua.h, ub.h, uc.h);
1556 
1557         if (unlikely(f32_is_inf(ur))) {
1558             s->float_exception_flags |= float_flag_overflow;
1559         } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) {
1560             ua = ua_orig;
1561             uc = uc_orig;
1562             goto soft;
1563         }
1564     }
1565     if (flags & float_muladd_negate_result) {
1566         return float32_chs(ur.s);
1567     }
1568     return ur.s;
1569 
1570  soft:
1571     return soft_f32_muladd(ua.s, ub.s, uc.s, flags, s);
1572 }
1573 
1574 float64 QEMU_FLATTEN
1575 float64_muladd(float64 xa, float64 xb, float64 xc, int flags, float_status *s)
1576 {
1577     union_float64 ua, ub, uc, ur;
1578 
1579     ua.s = xa;
1580     ub.s = xb;
1581     uc.s = xc;
1582 
1583     if (unlikely(!can_use_fpu(s))) {
1584         goto soft;
1585     }
1586     if (unlikely(flags & float_muladd_halve_result)) {
1587         goto soft;
1588     }
1589 
1590     float64_input_flush3(&ua.s, &ub.s, &uc.s, s);
1591     if (unlikely(!f64_is_zon3(ua, ub, uc))) {
1592         goto soft;
1593     }
1594 
1595     if (unlikely(force_soft_fma)) {
1596         goto soft;
1597     }
1598 
1599     /*
1600      * When (a || b) == 0, there's no need to check for under/over flow,
1601      * since we know the addend is (normal || 0) and the product is 0.
1602      */
1603     if (float64_is_zero(ua.s) || float64_is_zero(ub.s)) {
1604         union_float64 up;
1605         bool prod_sign;
1606 
1607         prod_sign = float64_is_neg(ua.s) ^ float64_is_neg(ub.s);
1608         prod_sign ^= !!(flags & float_muladd_negate_product);
1609         up.s = float64_set_sign(float64_zero, prod_sign);
1610 
1611         if (flags & float_muladd_negate_c) {
1612             uc.h = -uc.h;
1613         }
1614         ur.h = up.h + uc.h;
1615     } else {
1616         union_float64 ua_orig = ua;
1617         union_float64 uc_orig = uc;
1618 
1619         if (flags & float_muladd_negate_product) {
1620             ua.h = -ua.h;
1621         }
1622         if (flags & float_muladd_negate_c) {
1623             uc.h = -uc.h;
1624         }
1625 
1626         ur.h = fma(ua.h, ub.h, uc.h);
1627 
1628         if (unlikely(f64_is_inf(ur))) {
1629             s->float_exception_flags |= float_flag_overflow;
1630         } else if (unlikely(fabs(ur.h) <= FLT_MIN)) {
1631             ua = ua_orig;
1632             uc = uc_orig;
1633             goto soft;
1634         }
1635     }
1636     if (flags & float_muladd_negate_result) {
1637         return float64_chs(ur.s);
1638     }
1639     return ur.s;
1640 
1641  soft:
1642     return soft_f64_muladd(ua.s, ub.s, uc.s, flags, s);
1643 }
1644 
1645 /*
1646  * Returns the result of dividing the floating-point value `a' by the
1647  * corresponding value `b'. The operation is performed according to
1648  * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1649  */
1650 
1651 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s)
1652 {
1653     bool sign = a.sign ^ b.sign;
1654 
1655     if (a.cls == float_class_normal && b.cls == float_class_normal) {
1656         uint64_t n0, n1, q, r;
1657         int exp = a.exp - b.exp;
1658 
1659         /*
1660          * We want a 2*N / N-bit division to produce exactly an N-bit
1661          * result, so that we do not lose any precision and so that we
1662          * do not have to renormalize afterward.  If A.frac < B.frac,
1663          * then division would produce an (N-1)-bit result; shift A left
1664          * by one to produce the an N-bit result, and decrement the
1665          * exponent to match.
1666          *
1667          * The udiv_qrnnd algorithm that we're using requires normalization,
1668          * i.e. the msb of the denominator must be set.  Since we know that
1669          * DECOMPOSED_BINARY_POINT is msb-1, the inputs must be shifted left
1670          * by one (more), and the remainder must be shifted right by one.
1671          */
1672         if (a.frac < b.frac) {
1673             exp -= 1;
1674             shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 2, &n1, &n0);
1675         } else {
1676             shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, &n1, &n0);
1677         }
1678         q = udiv_qrnnd(&r, n1, n0, b.frac << 1);
1679 
1680         /*
1681          * Set lsb if there is a remainder, to set inexact.
1682          * As mentioned above, to find the actual value of the remainder we
1683          * would need to shift right, but (1) we are only concerned about
1684          * non-zero-ness, and (2) the remainder will always be even because
1685          * both inputs to the division primitive are even.
1686          */
1687         a.frac = q | (r != 0);
1688         a.sign = sign;
1689         a.exp = exp;
1690         return a;
1691     }
1692     /* handle all the NaN cases */
1693     if (is_nan(a.cls) || is_nan(b.cls)) {
1694         return pick_nan(a, b, s);
1695     }
1696     /* 0/0 or Inf/Inf */
1697     if (a.cls == b.cls
1698         &&
1699         (a.cls == float_class_inf || a.cls == float_class_zero)) {
1700         s->float_exception_flags |= float_flag_invalid;
1701         return parts_default_nan(s);
1702     }
1703     /* Inf / x or 0 / x */
1704     if (a.cls == float_class_inf || a.cls == float_class_zero) {
1705         a.sign = sign;
1706         return a;
1707     }
1708     /* Div 0 => Inf */
1709     if (b.cls == float_class_zero) {
1710         s->float_exception_flags |= float_flag_divbyzero;
1711         a.cls = float_class_inf;
1712         a.sign = sign;
1713         return a;
1714     }
1715     /* Div by Inf */
1716     if (b.cls == float_class_inf) {
1717         a.cls = float_class_zero;
1718         a.sign = sign;
1719         return a;
1720     }
1721     g_assert_not_reached();
1722 }
1723 
1724 float16 float16_div(float16 a, float16 b, float_status *status)
1725 {
1726     FloatParts pa = float16_unpack_canonical(a, status);
1727     FloatParts pb = float16_unpack_canonical(b, status);
1728     FloatParts pr = div_floats(pa, pb, status);
1729 
1730     return float16_round_pack_canonical(pr, status);
1731 }
1732 
1733 static float32 QEMU_SOFTFLOAT_ATTR
1734 soft_f32_div(float32 a, float32 b, float_status *status)
1735 {
1736     FloatParts pa = float32_unpack_canonical(a, status);
1737     FloatParts pb = float32_unpack_canonical(b, status);
1738     FloatParts pr = div_floats(pa, pb, status);
1739 
1740     return float32_round_pack_canonical(pr, status);
1741 }
1742 
1743 static float64 QEMU_SOFTFLOAT_ATTR
1744 soft_f64_div(float64 a, float64 b, float_status *status)
1745 {
1746     FloatParts pa = float64_unpack_canonical(a, status);
1747     FloatParts pb = float64_unpack_canonical(b, status);
1748     FloatParts pr = div_floats(pa, pb, status);
1749 
1750     return float64_round_pack_canonical(pr, status);
1751 }
1752 
1753 static float hard_f32_div(float a, float b)
1754 {
1755     return a / b;
1756 }
1757 
1758 static double hard_f64_div(double a, double b)
1759 {
1760     return a / b;
1761 }
1762 
1763 static bool f32_div_pre(union_float32 a, union_float32 b)
1764 {
1765     if (QEMU_HARDFLOAT_2F32_USE_FP) {
1766         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
1767                fpclassify(b.h) == FP_NORMAL;
1768     }
1769     return float32_is_zero_or_normal(a.s) && float32_is_normal(b.s);
1770 }
1771 
1772 static bool f64_div_pre(union_float64 a, union_float64 b)
1773 {
1774     if (QEMU_HARDFLOAT_2F64_USE_FP) {
1775         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
1776                fpclassify(b.h) == FP_NORMAL;
1777     }
1778     return float64_is_zero_or_normal(a.s) && float64_is_normal(b.s);
1779 }
1780 
1781 static bool f32_div_post(union_float32 a, union_float32 b)
1782 {
1783     if (QEMU_HARDFLOAT_2F32_USE_FP) {
1784         return fpclassify(a.h) != FP_ZERO;
1785     }
1786     return !float32_is_zero(a.s);
1787 }
1788 
1789 static bool f64_div_post(union_float64 a, union_float64 b)
1790 {
1791     if (QEMU_HARDFLOAT_2F64_USE_FP) {
1792         return fpclassify(a.h) != FP_ZERO;
1793     }
1794     return !float64_is_zero(a.s);
1795 }
1796 
1797 float32 QEMU_FLATTEN
1798 float32_div(float32 a, float32 b, float_status *s)
1799 {
1800     return float32_gen2(a, b, s, hard_f32_div, soft_f32_div,
1801                         f32_div_pre, f32_div_post);
1802 }
1803 
1804 float64 QEMU_FLATTEN
1805 float64_div(float64 a, float64 b, float_status *s)
1806 {
1807     return float64_gen2(a, b, s, hard_f64_div, soft_f64_div,
1808                         f64_div_pre, f64_div_post);
1809 }
1810 
1811 /*
1812  * Float to Float conversions
1813  *
1814  * Returns the result of converting one float format to another. The
1815  * conversion is performed according to the IEC/IEEE Standard for
1816  * Binary Floating-Point Arithmetic.
1817  *
1818  * The float_to_float helper only needs to take care of raising
1819  * invalid exceptions and handling the conversion on NaNs.
1820  */
1821 
1822 static FloatParts float_to_float(FloatParts a, const FloatFmt *dstf,
1823                                  float_status *s)
1824 {
1825     if (dstf->arm_althp) {
1826         switch (a.cls) {
1827         case float_class_qnan:
1828         case float_class_snan:
1829             /* There is no NaN in the destination format.  Raise Invalid
1830              * and return a zero with the sign of the input NaN.
1831              */
1832             s->float_exception_flags |= float_flag_invalid;
1833             a.cls = float_class_zero;
1834             a.frac = 0;
1835             a.exp = 0;
1836             break;
1837 
1838         case float_class_inf:
1839             /* There is no Inf in the destination format.  Raise Invalid
1840              * and return the maximum normal with the correct sign.
1841              */
1842             s->float_exception_flags |= float_flag_invalid;
1843             a.cls = float_class_normal;
1844             a.exp = dstf->exp_max;
1845             a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift;
1846             break;
1847 
1848         default:
1849             break;
1850         }
1851     } else if (is_nan(a.cls)) {
1852         if (is_snan(a.cls)) {
1853             s->float_exception_flags |= float_flag_invalid;
1854             a = parts_silence_nan(a, s);
1855         }
1856         if (s->default_nan_mode) {
1857             return parts_default_nan(s);
1858         }
1859     }
1860     return a;
1861 }
1862 
1863 float32 float16_to_float32(float16 a, bool ieee, float_status *s)
1864 {
1865     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1866     FloatParts p = float16a_unpack_canonical(a, s, fmt16);
1867     FloatParts pr = float_to_float(p, &float32_params, s);
1868     return float32_round_pack_canonical(pr, s);
1869 }
1870 
1871 float64 float16_to_float64(float16 a, bool ieee, float_status *s)
1872 {
1873     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1874     FloatParts p = float16a_unpack_canonical(a, s, fmt16);
1875     FloatParts pr = float_to_float(p, &float64_params, s);
1876     return float64_round_pack_canonical(pr, s);
1877 }
1878 
1879 float16 float32_to_float16(float32 a, bool ieee, float_status *s)
1880 {
1881     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1882     FloatParts p = float32_unpack_canonical(a, s);
1883     FloatParts pr = float_to_float(p, fmt16, s);
1884     return float16a_round_pack_canonical(pr, s, fmt16);
1885 }
1886 
1887 static float64 QEMU_SOFTFLOAT_ATTR
1888 soft_float32_to_float64(float32 a, float_status *s)
1889 {
1890     FloatParts p = float32_unpack_canonical(a, s);
1891     FloatParts pr = float_to_float(p, &float64_params, s);
1892     return float64_round_pack_canonical(pr, s);
1893 }
1894 
1895 float64 float32_to_float64(float32 a, float_status *s)
1896 {
1897     if (likely(float32_is_normal(a))) {
1898         /* Widening conversion can never produce inexact results.  */
1899         union_float32 uf;
1900         union_float64 ud;
1901         uf.s = a;
1902         ud.h = uf.h;
1903         return ud.s;
1904     } else if (float32_is_zero(a)) {
1905         return float64_set_sign(float64_zero, float32_is_neg(a));
1906     } else {
1907         return soft_float32_to_float64(a, s);
1908     }
1909 }
1910 
1911 float16 float64_to_float16(float64 a, bool ieee, float_status *s)
1912 {
1913     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1914     FloatParts p = float64_unpack_canonical(a, s);
1915     FloatParts pr = float_to_float(p, fmt16, s);
1916     return float16a_round_pack_canonical(pr, s, fmt16);
1917 }
1918 
1919 float32 float64_to_float32(float64 a, float_status *s)
1920 {
1921     FloatParts p = float64_unpack_canonical(a, s);
1922     FloatParts pr = float_to_float(p, &float32_params, s);
1923     return float32_round_pack_canonical(pr, s);
1924 }
1925 
1926 /*
1927  * Rounds the floating-point value `a' to an integer, and returns the
1928  * result as a floating-point value. The operation is performed
1929  * according to the IEC/IEEE Standard for Binary Floating-Point
1930  * Arithmetic.
1931  */
1932 
1933 static FloatParts round_to_int(FloatParts a, FloatRoundMode rmode,
1934                                int scale, float_status *s)
1935 {
1936     switch (a.cls) {
1937     case float_class_qnan:
1938     case float_class_snan:
1939         return return_nan(a, s);
1940 
1941     case float_class_zero:
1942     case float_class_inf:
1943         /* already "integral" */
1944         break;
1945 
1946     case float_class_normal:
1947         scale = MIN(MAX(scale, -0x10000), 0x10000);
1948         a.exp += scale;
1949 
1950         if (a.exp >= DECOMPOSED_BINARY_POINT) {
1951             /* already integral */
1952             break;
1953         }
1954         if (a.exp < 0) {
1955             bool one;
1956             /* all fractional */
1957             s->float_exception_flags |= float_flag_inexact;
1958             switch (rmode) {
1959             case float_round_nearest_even:
1960                 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
1961                 break;
1962             case float_round_ties_away:
1963                 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
1964                 break;
1965             case float_round_to_zero:
1966                 one = false;
1967                 break;
1968             case float_round_up:
1969                 one = !a.sign;
1970                 break;
1971             case float_round_down:
1972                 one = a.sign;
1973                 break;
1974             case float_round_to_odd:
1975                 one = true;
1976                 break;
1977             default:
1978                 g_assert_not_reached();
1979             }
1980 
1981             if (one) {
1982                 a.frac = DECOMPOSED_IMPLICIT_BIT;
1983                 a.exp = 0;
1984             } else {
1985                 a.cls = float_class_zero;
1986             }
1987         } else {
1988             uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
1989             uint64_t frac_lsbm1 = frac_lsb >> 1;
1990             uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
1991             uint64_t rnd_mask = rnd_even_mask >> 1;
1992             uint64_t inc;
1993 
1994             switch (rmode) {
1995             case float_round_nearest_even:
1996                 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
1997                 break;
1998             case float_round_ties_away:
1999                 inc = frac_lsbm1;
2000                 break;
2001             case float_round_to_zero:
2002                 inc = 0;
2003                 break;
2004             case float_round_up:
2005                 inc = a.sign ? 0 : rnd_mask;
2006                 break;
2007             case float_round_down:
2008                 inc = a.sign ? rnd_mask : 0;
2009                 break;
2010             case float_round_to_odd:
2011                 inc = a.frac & frac_lsb ? 0 : rnd_mask;
2012                 break;
2013             default:
2014                 g_assert_not_reached();
2015             }
2016 
2017             if (a.frac & rnd_mask) {
2018                 s->float_exception_flags |= float_flag_inexact;
2019                 a.frac += inc;
2020                 a.frac &= ~rnd_mask;
2021                 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
2022                     a.frac >>= 1;
2023                     a.exp++;
2024                 }
2025             }
2026         }
2027         break;
2028     default:
2029         g_assert_not_reached();
2030     }
2031     return a;
2032 }
2033 
2034 float16 float16_round_to_int(float16 a, float_status *s)
2035 {
2036     FloatParts pa = float16_unpack_canonical(a, s);
2037     FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2038     return float16_round_pack_canonical(pr, s);
2039 }
2040 
2041 float32 float32_round_to_int(float32 a, float_status *s)
2042 {
2043     FloatParts pa = float32_unpack_canonical(a, s);
2044     FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2045     return float32_round_pack_canonical(pr, s);
2046 }
2047 
2048 float64 float64_round_to_int(float64 a, float_status *s)
2049 {
2050     FloatParts pa = float64_unpack_canonical(a, s);
2051     FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2052     return float64_round_pack_canonical(pr, s);
2053 }
2054 
2055 /*
2056  * Returns the result of converting the floating-point value `a' to
2057  * the two's complement integer format. The conversion is performed
2058  * according to the IEC/IEEE Standard for Binary Floating-Point
2059  * Arithmetic---which means in particular that the conversion is
2060  * rounded according to the current rounding mode. If `a' is a NaN,
2061  * the largest positive integer is returned. Otherwise, if the
2062  * conversion overflows, the largest integer with the same sign as `a'
2063  * is returned.
2064 */
2065 
2066 static int64_t round_to_int_and_pack(FloatParts in, FloatRoundMode rmode,
2067                                      int scale, int64_t min, int64_t max,
2068                                      float_status *s)
2069 {
2070     uint64_t r;
2071     int orig_flags = get_float_exception_flags(s);
2072     FloatParts p = round_to_int(in, rmode, scale, s);
2073 
2074     switch (p.cls) {
2075     case float_class_snan:
2076     case float_class_qnan:
2077         s->float_exception_flags = orig_flags | float_flag_invalid;
2078         return max;
2079     case float_class_inf:
2080         s->float_exception_flags = orig_flags | float_flag_invalid;
2081         return p.sign ? min : max;
2082     case float_class_zero:
2083         return 0;
2084     case float_class_normal:
2085         if (p.exp < DECOMPOSED_BINARY_POINT) {
2086             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
2087         } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
2088             r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
2089         } else {
2090             r = UINT64_MAX;
2091         }
2092         if (p.sign) {
2093             if (r <= -(uint64_t) min) {
2094                 return -r;
2095             } else {
2096                 s->float_exception_flags = orig_flags | float_flag_invalid;
2097                 return min;
2098             }
2099         } else {
2100             if (r <= max) {
2101                 return r;
2102             } else {
2103                 s->float_exception_flags = orig_flags | float_flag_invalid;
2104                 return max;
2105             }
2106         }
2107     default:
2108         g_assert_not_reached();
2109     }
2110 }
2111 
2112 int16_t float16_to_int16_scalbn(float16 a, FloatRoundMode rmode, int scale,
2113                                 float_status *s)
2114 {
2115     return round_to_int_and_pack(float16_unpack_canonical(a, s),
2116                                  rmode, scale, INT16_MIN, INT16_MAX, s);
2117 }
2118 
2119 int32_t float16_to_int32_scalbn(float16 a, FloatRoundMode rmode, int scale,
2120                                 float_status *s)
2121 {
2122     return round_to_int_and_pack(float16_unpack_canonical(a, s),
2123                                  rmode, scale, INT32_MIN, INT32_MAX, s);
2124 }
2125 
2126 int64_t float16_to_int64_scalbn(float16 a, FloatRoundMode rmode, int scale,
2127                                 float_status *s)
2128 {
2129     return round_to_int_and_pack(float16_unpack_canonical(a, s),
2130                                  rmode, scale, INT64_MIN, INT64_MAX, s);
2131 }
2132 
2133 int16_t float32_to_int16_scalbn(float32 a, FloatRoundMode rmode, int scale,
2134                                 float_status *s)
2135 {
2136     return round_to_int_and_pack(float32_unpack_canonical(a, s),
2137                                  rmode, scale, INT16_MIN, INT16_MAX, s);
2138 }
2139 
2140 int32_t float32_to_int32_scalbn(float32 a, FloatRoundMode rmode, int scale,
2141                                 float_status *s)
2142 {
2143     return round_to_int_and_pack(float32_unpack_canonical(a, s),
2144                                  rmode, scale, INT32_MIN, INT32_MAX, s);
2145 }
2146 
2147 int64_t float32_to_int64_scalbn(float32 a, FloatRoundMode rmode, int scale,
2148                                 float_status *s)
2149 {
2150     return round_to_int_and_pack(float32_unpack_canonical(a, s),
2151                                  rmode, scale, INT64_MIN, INT64_MAX, s);
2152 }
2153 
2154 int16_t float64_to_int16_scalbn(float64 a, FloatRoundMode rmode, int scale,
2155                                 float_status *s)
2156 {
2157     return round_to_int_and_pack(float64_unpack_canonical(a, s),
2158                                  rmode, scale, INT16_MIN, INT16_MAX, s);
2159 }
2160 
2161 int32_t float64_to_int32_scalbn(float64 a, FloatRoundMode rmode, int scale,
2162                                 float_status *s)
2163 {
2164     return round_to_int_and_pack(float64_unpack_canonical(a, s),
2165                                  rmode, scale, INT32_MIN, INT32_MAX, s);
2166 }
2167 
2168 int64_t float64_to_int64_scalbn(float64 a, FloatRoundMode rmode, int scale,
2169                                 float_status *s)
2170 {
2171     return round_to_int_and_pack(float64_unpack_canonical(a, s),
2172                                  rmode, scale, INT64_MIN, INT64_MAX, s);
2173 }
2174 
2175 int16_t float16_to_int16(float16 a, float_status *s)
2176 {
2177     return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2178 }
2179 
2180 int32_t float16_to_int32(float16 a, float_status *s)
2181 {
2182     return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2183 }
2184 
2185 int64_t float16_to_int64(float16 a, float_status *s)
2186 {
2187     return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2188 }
2189 
2190 int16_t float32_to_int16(float32 a, float_status *s)
2191 {
2192     return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2193 }
2194 
2195 int32_t float32_to_int32(float32 a, float_status *s)
2196 {
2197     return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2198 }
2199 
2200 int64_t float32_to_int64(float32 a, float_status *s)
2201 {
2202     return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2203 }
2204 
2205 int16_t float64_to_int16(float64 a, float_status *s)
2206 {
2207     return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2208 }
2209 
2210 int32_t float64_to_int32(float64 a, float_status *s)
2211 {
2212     return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2213 }
2214 
2215 int64_t float64_to_int64(float64 a, float_status *s)
2216 {
2217     return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2218 }
2219 
2220 int16_t float16_to_int16_round_to_zero(float16 a, float_status *s)
2221 {
2222     return float16_to_int16_scalbn(a, float_round_to_zero, 0, s);
2223 }
2224 
2225 int32_t float16_to_int32_round_to_zero(float16 a, float_status *s)
2226 {
2227     return float16_to_int32_scalbn(a, float_round_to_zero, 0, s);
2228 }
2229 
2230 int64_t float16_to_int64_round_to_zero(float16 a, float_status *s)
2231 {
2232     return float16_to_int64_scalbn(a, float_round_to_zero, 0, s);
2233 }
2234 
2235 int16_t float32_to_int16_round_to_zero(float32 a, float_status *s)
2236 {
2237     return float32_to_int16_scalbn(a, float_round_to_zero, 0, s);
2238 }
2239 
2240 int32_t float32_to_int32_round_to_zero(float32 a, float_status *s)
2241 {
2242     return float32_to_int32_scalbn(a, float_round_to_zero, 0, s);
2243 }
2244 
2245 int64_t float32_to_int64_round_to_zero(float32 a, float_status *s)
2246 {
2247     return float32_to_int64_scalbn(a, float_round_to_zero, 0, s);
2248 }
2249 
2250 int16_t float64_to_int16_round_to_zero(float64 a, float_status *s)
2251 {
2252     return float64_to_int16_scalbn(a, float_round_to_zero, 0, s);
2253 }
2254 
2255 int32_t float64_to_int32_round_to_zero(float64 a, float_status *s)
2256 {
2257     return float64_to_int32_scalbn(a, float_round_to_zero, 0, s);
2258 }
2259 
2260 int64_t float64_to_int64_round_to_zero(float64 a, float_status *s)
2261 {
2262     return float64_to_int64_scalbn(a, float_round_to_zero, 0, s);
2263 }
2264 
2265 /*
2266  *  Returns the result of converting the floating-point value `a' to
2267  *  the unsigned integer format. The conversion is performed according
2268  *  to the IEC/IEEE Standard for Binary Floating-Point
2269  *  Arithmetic---which means in particular that the conversion is
2270  *  rounded according to the current rounding mode. If `a' is a NaN,
2271  *  the largest unsigned integer is returned. Otherwise, if the
2272  *  conversion overflows, the largest unsigned integer is returned. If
2273  *  the 'a' is negative, the result is rounded and zero is returned;
2274  *  values that do not round to zero will raise the inexact exception
2275  *  flag.
2276  */
2277 
2278 static uint64_t round_to_uint_and_pack(FloatParts in, FloatRoundMode rmode,
2279                                        int scale, uint64_t max,
2280                                        float_status *s)
2281 {
2282     int orig_flags = get_float_exception_flags(s);
2283     FloatParts p = round_to_int(in, rmode, scale, s);
2284     uint64_t r;
2285 
2286     switch (p.cls) {
2287     case float_class_snan:
2288     case float_class_qnan:
2289         s->float_exception_flags = orig_flags | float_flag_invalid;
2290         return max;
2291     case float_class_inf:
2292         s->float_exception_flags = orig_flags | float_flag_invalid;
2293         return p.sign ? 0 : max;
2294     case float_class_zero:
2295         return 0;
2296     case float_class_normal:
2297         if (p.sign) {
2298             s->float_exception_flags = orig_flags | float_flag_invalid;
2299             return 0;
2300         }
2301 
2302         if (p.exp < DECOMPOSED_BINARY_POINT) {
2303             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
2304         } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
2305             r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
2306         } else {
2307             s->float_exception_flags = orig_flags | float_flag_invalid;
2308             return max;
2309         }
2310 
2311         /* For uint64 this will never trip, but if p.exp is too large
2312          * to shift a decomposed fraction we shall have exited via the
2313          * 3rd leg above.
2314          */
2315         if (r > max) {
2316             s->float_exception_flags = orig_flags | float_flag_invalid;
2317             return max;
2318         }
2319         return r;
2320     default:
2321         g_assert_not_reached();
2322     }
2323 }
2324 
2325 uint16_t float16_to_uint16_scalbn(float16 a, FloatRoundMode rmode, int scale,
2326                                   float_status *s)
2327 {
2328     return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2329                                   rmode, scale, UINT16_MAX, s);
2330 }
2331 
2332 uint32_t float16_to_uint32_scalbn(float16 a, FloatRoundMode rmode, int scale,
2333                                   float_status *s)
2334 {
2335     return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2336                                   rmode, scale, UINT32_MAX, s);
2337 }
2338 
2339 uint64_t float16_to_uint64_scalbn(float16 a, FloatRoundMode rmode, int scale,
2340                                   float_status *s)
2341 {
2342     return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2343                                   rmode, scale, UINT64_MAX, s);
2344 }
2345 
2346 uint16_t float32_to_uint16_scalbn(float32 a, FloatRoundMode rmode, int scale,
2347                                   float_status *s)
2348 {
2349     return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2350                                   rmode, scale, UINT16_MAX, s);
2351 }
2352 
2353 uint32_t float32_to_uint32_scalbn(float32 a, FloatRoundMode rmode, int scale,
2354                                   float_status *s)
2355 {
2356     return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2357                                   rmode, scale, UINT32_MAX, s);
2358 }
2359 
2360 uint64_t float32_to_uint64_scalbn(float32 a, FloatRoundMode rmode, int scale,
2361                                   float_status *s)
2362 {
2363     return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2364                                   rmode, scale, UINT64_MAX, s);
2365 }
2366 
2367 uint16_t float64_to_uint16_scalbn(float64 a, FloatRoundMode rmode, int scale,
2368                                   float_status *s)
2369 {
2370     return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2371                                   rmode, scale, UINT16_MAX, s);
2372 }
2373 
2374 uint32_t float64_to_uint32_scalbn(float64 a, FloatRoundMode rmode, int scale,
2375                                   float_status *s)
2376 {
2377     return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2378                                   rmode, scale, UINT32_MAX, s);
2379 }
2380 
2381 uint64_t float64_to_uint64_scalbn(float64 a, FloatRoundMode rmode, int scale,
2382                                   float_status *s)
2383 {
2384     return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2385                                   rmode, scale, UINT64_MAX, s);
2386 }
2387 
2388 uint16_t float16_to_uint16(float16 a, float_status *s)
2389 {
2390     return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2391 }
2392 
2393 uint32_t float16_to_uint32(float16 a, float_status *s)
2394 {
2395     return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2396 }
2397 
2398 uint64_t float16_to_uint64(float16 a, float_status *s)
2399 {
2400     return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2401 }
2402 
2403 uint16_t float32_to_uint16(float32 a, float_status *s)
2404 {
2405     return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2406 }
2407 
2408 uint32_t float32_to_uint32(float32 a, float_status *s)
2409 {
2410     return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2411 }
2412 
2413 uint64_t float32_to_uint64(float32 a, float_status *s)
2414 {
2415     return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2416 }
2417 
2418 uint16_t float64_to_uint16(float64 a, float_status *s)
2419 {
2420     return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2421 }
2422 
2423 uint32_t float64_to_uint32(float64 a, float_status *s)
2424 {
2425     return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2426 }
2427 
2428 uint64_t float64_to_uint64(float64 a, float_status *s)
2429 {
2430     return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2431 }
2432 
2433 uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s)
2434 {
2435     return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2436 }
2437 
2438 uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s)
2439 {
2440     return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2441 }
2442 
2443 uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s)
2444 {
2445     return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2446 }
2447 
2448 uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *s)
2449 {
2450     return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2451 }
2452 
2453 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s)
2454 {
2455     return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2456 }
2457 
2458 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s)
2459 {
2460     return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2461 }
2462 
2463 uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s)
2464 {
2465     return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2466 }
2467 
2468 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s)
2469 {
2470     return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2471 }
2472 
2473 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s)
2474 {
2475     return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2476 }
2477 
2478 /*
2479  * Integer to float conversions
2480  *
2481  * Returns the result of converting the two's complement integer `a'
2482  * to the floating-point format. The conversion is performed according
2483  * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2484  */
2485 
2486 static FloatParts int_to_float(int64_t a, int scale, float_status *status)
2487 {
2488     FloatParts r = { .sign = false };
2489 
2490     if (a == 0) {
2491         r.cls = float_class_zero;
2492     } else {
2493         uint64_t f = a;
2494         int shift;
2495 
2496         r.cls = float_class_normal;
2497         if (a < 0) {
2498             f = -f;
2499             r.sign = true;
2500         }
2501         shift = clz64(f) - 1;
2502         scale = MIN(MAX(scale, -0x10000), 0x10000);
2503 
2504         r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
2505         r.frac = (shift < 0 ? DECOMPOSED_IMPLICIT_BIT : f << shift);
2506     }
2507 
2508     return r;
2509 }
2510 
2511 float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status)
2512 {
2513     FloatParts pa = int_to_float(a, scale, status);
2514     return float16_round_pack_canonical(pa, status);
2515 }
2516 
2517 float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status)
2518 {
2519     return int64_to_float16_scalbn(a, scale, status);
2520 }
2521 
2522 float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status)
2523 {
2524     return int64_to_float16_scalbn(a, scale, status);
2525 }
2526 
2527 float16 int64_to_float16(int64_t a, float_status *status)
2528 {
2529     return int64_to_float16_scalbn(a, 0, status);
2530 }
2531 
2532 float16 int32_to_float16(int32_t a, float_status *status)
2533 {
2534     return int64_to_float16_scalbn(a, 0, status);
2535 }
2536 
2537 float16 int16_to_float16(int16_t a, float_status *status)
2538 {
2539     return int64_to_float16_scalbn(a, 0, status);
2540 }
2541 
2542 float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status)
2543 {
2544     FloatParts pa = int_to_float(a, scale, status);
2545     return float32_round_pack_canonical(pa, status);
2546 }
2547 
2548 float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status)
2549 {
2550     return int64_to_float32_scalbn(a, scale, status);
2551 }
2552 
2553 float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status)
2554 {
2555     return int64_to_float32_scalbn(a, scale, status);
2556 }
2557 
2558 float32 int64_to_float32(int64_t a, float_status *status)
2559 {
2560     return int64_to_float32_scalbn(a, 0, status);
2561 }
2562 
2563 float32 int32_to_float32(int32_t a, float_status *status)
2564 {
2565     return int64_to_float32_scalbn(a, 0, status);
2566 }
2567 
2568 float32 int16_to_float32(int16_t a, float_status *status)
2569 {
2570     return int64_to_float32_scalbn(a, 0, status);
2571 }
2572 
2573 float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status)
2574 {
2575     FloatParts pa = int_to_float(a, scale, status);
2576     return float64_round_pack_canonical(pa, status);
2577 }
2578 
2579 float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status)
2580 {
2581     return int64_to_float64_scalbn(a, scale, status);
2582 }
2583 
2584 float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status)
2585 {
2586     return int64_to_float64_scalbn(a, scale, status);
2587 }
2588 
2589 float64 int64_to_float64(int64_t a, float_status *status)
2590 {
2591     return int64_to_float64_scalbn(a, 0, status);
2592 }
2593 
2594 float64 int32_to_float64(int32_t a, float_status *status)
2595 {
2596     return int64_to_float64_scalbn(a, 0, status);
2597 }
2598 
2599 float64 int16_to_float64(int16_t a, float_status *status)
2600 {
2601     return int64_to_float64_scalbn(a, 0, status);
2602 }
2603 
2604 
2605 /*
2606  * Unsigned Integer to float conversions
2607  *
2608  * Returns the result of converting the unsigned integer `a' to the
2609  * floating-point format. The conversion is performed according to the
2610  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2611  */
2612 
2613 static FloatParts uint_to_float(uint64_t a, int scale, float_status *status)
2614 {
2615     FloatParts r = { .sign = false };
2616 
2617     if (a == 0) {
2618         r.cls = float_class_zero;
2619     } else {
2620         scale = MIN(MAX(scale, -0x10000), 0x10000);
2621         r.cls = float_class_normal;
2622         if ((int64_t)a < 0) {
2623             r.exp = DECOMPOSED_BINARY_POINT + 1 + scale;
2624             shift64RightJamming(a, 1, &a);
2625             r.frac = a;
2626         } else {
2627             int shift = clz64(a) - 1;
2628             r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
2629             r.frac = a << shift;
2630         }
2631     }
2632 
2633     return r;
2634 }
2635 
2636 float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status)
2637 {
2638     FloatParts pa = uint_to_float(a, scale, status);
2639     return float16_round_pack_canonical(pa, status);
2640 }
2641 
2642 float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status)
2643 {
2644     return uint64_to_float16_scalbn(a, scale, status);
2645 }
2646 
2647 float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status)
2648 {
2649     return uint64_to_float16_scalbn(a, scale, status);
2650 }
2651 
2652 float16 uint64_to_float16(uint64_t a, float_status *status)
2653 {
2654     return uint64_to_float16_scalbn(a, 0, status);
2655 }
2656 
2657 float16 uint32_to_float16(uint32_t a, float_status *status)
2658 {
2659     return uint64_to_float16_scalbn(a, 0, status);
2660 }
2661 
2662 float16 uint16_to_float16(uint16_t a, float_status *status)
2663 {
2664     return uint64_to_float16_scalbn(a, 0, status);
2665 }
2666 
2667 float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status)
2668 {
2669     FloatParts pa = uint_to_float(a, scale, status);
2670     return float32_round_pack_canonical(pa, status);
2671 }
2672 
2673 float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status)
2674 {
2675     return uint64_to_float32_scalbn(a, scale, status);
2676 }
2677 
2678 float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status)
2679 {
2680     return uint64_to_float32_scalbn(a, scale, status);
2681 }
2682 
2683 float32 uint64_to_float32(uint64_t a, float_status *status)
2684 {
2685     return uint64_to_float32_scalbn(a, 0, status);
2686 }
2687 
2688 float32 uint32_to_float32(uint32_t a, float_status *status)
2689 {
2690     return uint64_to_float32_scalbn(a, 0, status);
2691 }
2692 
2693 float32 uint16_to_float32(uint16_t a, float_status *status)
2694 {
2695     return uint64_to_float32_scalbn(a, 0, status);
2696 }
2697 
2698 float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status)
2699 {
2700     FloatParts pa = uint_to_float(a, scale, status);
2701     return float64_round_pack_canonical(pa, status);
2702 }
2703 
2704 float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status)
2705 {
2706     return uint64_to_float64_scalbn(a, scale, status);
2707 }
2708 
2709 float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status)
2710 {
2711     return uint64_to_float64_scalbn(a, scale, status);
2712 }
2713 
2714 float64 uint64_to_float64(uint64_t a, float_status *status)
2715 {
2716     return uint64_to_float64_scalbn(a, 0, status);
2717 }
2718 
2719 float64 uint32_to_float64(uint32_t a, float_status *status)
2720 {
2721     return uint64_to_float64_scalbn(a, 0, status);
2722 }
2723 
2724 float64 uint16_to_float64(uint16_t a, float_status *status)
2725 {
2726     return uint64_to_float64_scalbn(a, 0, status);
2727 }
2728 
2729 /* Float Min/Max */
2730 /* min() and max() functions. These can't be implemented as
2731  * 'compare and pick one input' because that would mishandle
2732  * NaNs and +0 vs -0.
2733  *
2734  * minnum() and maxnum() functions. These are similar to the min()
2735  * and max() functions but if one of the arguments is a QNaN and
2736  * the other is numerical then the numerical argument is returned.
2737  * SNaNs will get quietened before being returned.
2738  * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
2739  * and maxNum() operations. min() and max() are the typical min/max
2740  * semantics provided by many CPUs which predate that specification.
2741  *
2742  * minnummag() and maxnummag() functions correspond to minNumMag()
2743  * and minNumMag() from the IEEE-754 2008.
2744  */
2745 static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin,
2746                                 bool ieee, bool ismag, float_status *s)
2747 {
2748     if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
2749         if (ieee) {
2750             /* Takes two floating-point values `a' and `b', one of
2751              * which is a NaN, and returns the appropriate NaN
2752              * result. If either `a' or `b' is a signaling NaN,
2753              * the invalid exception is raised.
2754              */
2755             if (is_snan(a.cls) || is_snan(b.cls)) {
2756                 return pick_nan(a, b, s);
2757             } else if (is_nan(a.cls) && !is_nan(b.cls)) {
2758                 return b;
2759             } else if (is_nan(b.cls) && !is_nan(a.cls)) {
2760                 return a;
2761             }
2762         }
2763         return pick_nan(a, b, s);
2764     } else {
2765         int a_exp, b_exp;
2766 
2767         switch (a.cls) {
2768         case float_class_normal:
2769             a_exp = a.exp;
2770             break;
2771         case float_class_inf:
2772             a_exp = INT_MAX;
2773             break;
2774         case float_class_zero:
2775             a_exp = INT_MIN;
2776             break;
2777         default:
2778             g_assert_not_reached();
2779             break;
2780         }
2781         switch (b.cls) {
2782         case float_class_normal:
2783             b_exp = b.exp;
2784             break;
2785         case float_class_inf:
2786             b_exp = INT_MAX;
2787             break;
2788         case float_class_zero:
2789             b_exp = INT_MIN;
2790             break;
2791         default:
2792             g_assert_not_reached();
2793             break;
2794         }
2795 
2796         if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
2797             bool a_less = a_exp < b_exp;
2798             if (a_exp == b_exp) {
2799                 a_less = a.frac < b.frac;
2800             }
2801             return a_less ^ ismin ? b : a;
2802         }
2803 
2804         if (a.sign == b.sign) {
2805             bool a_less = a_exp < b_exp;
2806             if (a_exp == b_exp) {
2807                 a_less = a.frac < b.frac;
2808             }
2809             return a.sign ^ a_less ^ ismin ? b : a;
2810         } else {
2811             return a.sign ^ ismin ? b : a;
2812         }
2813     }
2814 }
2815 
2816 #define MINMAX(sz, name, ismin, isiee, ismag)                           \
2817 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b,      \
2818                                      float_status *s)                   \
2819 {                                                                       \
2820     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
2821     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
2822     FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);      \
2823                                                                         \
2824     return float ## sz ## _round_pack_canonical(pr, s);                 \
2825 }
2826 
2827 MINMAX(16, min, true, false, false)
2828 MINMAX(16, minnum, true, true, false)
2829 MINMAX(16, minnummag, true, true, true)
2830 MINMAX(16, max, false, false, false)
2831 MINMAX(16, maxnum, false, true, false)
2832 MINMAX(16, maxnummag, false, true, true)
2833 
2834 MINMAX(32, min, true, false, false)
2835 MINMAX(32, minnum, true, true, false)
2836 MINMAX(32, minnummag, true, true, true)
2837 MINMAX(32, max, false, false, false)
2838 MINMAX(32, maxnum, false, true, false)
2839 MINMAX(32, maxnummag, false, true, true)
2840 
2841 MINMAX(64, min, true, false, false)
2842 MINMAX(64, minnum, true, true, false)
2843 MINMAX(64, minnummag, true, true, true)
2844 MINMAX(64, max, false, false, false)
2845 MINMAX(64, maxnum, false, true, false)
2846 MINMAX(64, maxnummag, false, true, true)
2847 
2848 #undef MINMAX
2849 
2850 /* Floating point compare */
2851 static FloatRelation compare_floats(FloatParts a, FloatParts b, bool is_quiet,
2852                                     float_status *s)
2853 {
2854     if (is_nan(a.cls) || is_nan(b.cls)) {
2855         if (!is_quiet ||
2856             a.cls == float_class_snan ||
2857             b.cls == float_class_snan) {
2858             s->float_exception_flags |= float_flag_invalid;
2859         }
2860         return float_relation_unordered;
2861     }
2862 
2863     if (a.cls == float_class_zero) {
2864         if (b.cls == float_class_zero) {
2865             return float_relation_equal;
2866         }
2867         return b.sign ? float_relation_greater : float_relation_less;
2868     } else if (b.cls == float_class_zero) {
2869         return a.sign ? float_relation_less : float_relation_greater;
2870     }
2871 
2872     /* The only really important thing about infinity is its sign. If
2873      * both are infinities the sign marks the smallest of the two.
2874      */
2875     if (a.cls == float_class_inf) {
2876         if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
2877             return float_relation_equal;
2878         }
2879         return a.sign ? float_relation_less : float_relation_greater;
2880     } else if (b.cls == float_class_inf) {
2881         return b.sign ? float_relation_greater : float_relation_less;
2882     }
2883 
2884     if (a.sign != b.sign) {
2885         return a.sign ? float_relation_less : float_relation_greater;
2886     }
2887 
2888     if (a.exp == b.exp) {
2889         if (a.frac == b.frac) {
2890             return float_relation_equal;
2891         }
2892         if (a.sign) {
2893             return a.frac > b.frac ?
2894                 float_relation_less : float_relation_greater;
2895         } else {
2896             return a.frac > b.frac ?
2897                 float_relation_greater : float_relation_less;
2898         }
2899     } else {
2900         if (a.sign) {
2901             return a.exp > b.exp ? float_relation_less : float_relation_greater;
2902         } else {
2903             return a.exp > b.exp ? float_relation_greater : float_relation_less;
2904         }
2905     }
2906 }
2907 
2908 #define COMPARE(name, attr, sz)                                         \
2909 static int attr                                                         \
2910 name(float ## sz a, float ## sz b, bool is_quiet, float_status *s)      \
2911 {                                                                       \
2912     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
2913     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
2914     return compare_floats(pa, pb, is_quiet, s);                         \
2915 }
2916 
2917 COMPARE(soft_f16_compare, QEMU_FLATTEN, 16)
2918 COMPARE(soft_f32_compare, QEMU_SOFTFLOAT_ATTR, 32)
2919 COMPARE(soft_f64_compare, QEMU_SOFTFLOAT_ATTR, 64)
2920 
2921 #undef COMPARE
2922 
2923 FloatRelation float16_compare(float16 a, float16 b, float_status *s)
2924 {
2925     return soft_f16_compare(a, b, false, s);
2926 }
2927 
2928 FloatRelation float16_compare_quiet(float16 a, float16 b, float_status *s)
2929 {
2930     return soft_f16_compare(a, b, true, s);
2931 }
2932 
2933 static FloatRelation QEMU_FLATTEN
2934 f32_compare(float32 xa, float32 xb, bool is_quiet, float_status *s)
2935 {
2936     union_float32 ua, ub;
2937 
2938     ua.s = xa;
2939     ub.s = xb;
2940 
2941     if (QEMU_NO_HARDFLOAT) {
2942         goto soft;
2943     }
2944 
2945     float32_input_flush2(&ua.s, &ub.s, s);
2946     if (isgreaterequal(ua.h, ub.h)) {
2947         if (isgreater(ua.h, ub.h)) {
2948             return float_relation_greater;
2949         }
2950         return float_relation_equal;
2951     }
2952     if (likely(isless(ua.h, ub.h))) {
2953         return float_relation_less;
2954     }
2955     /* The only condition remaining is unordered.
2956      * Fall through to set flags.
2957      */
2958  soft:
2959     return soft_f32_compare(ua.s, ub.s, is_quiet, s);
2960 }
2961 
2962 FloatRelation float32_compare(float32 a, float32 b, float_status *s)
2963 {
2964     return f32_compare(a, b, false, s);
2965 }
2966 
2967 FloatRelation float32_compare_quiet(float32 a, float32 b, float_status *s)
2968 {
2969     return f32_compare(a, b, true, s);
2970 }
2971 
2972 static FloatRelation QEMU_FLATTEN
2973 f64_compare(float64 xa, float64 xb, bool is_quiet, float_status *s)
2974 {
2975     union_float64 ua, ub;
2976 
2977     ua.s = xa;
2978     ub.s = xb;
2979 
2980     if (QEMU_NO_HARDFLOAT) {
2981         goto soft;
2982     }
2983 
2984     float64_input_flush2(&ua.s, &ub.s, s);
2985     if (isgreaterequal(ua.h, ub.h)) {
2986         if (isgreater(ua.h, ub.h)) {
2987             return float_relation_greater;
2988         }
2989         return float_relation_equal;
2990     }
2991     if (likely(isless(ua.h, ub.h))) {
2992         return float_relation_less;
2993     }
2994     /* The only condition remaining is unordered.
2995      * Fall through to set flags.
2996      */
2997  soft:
2998     return soft_f64_compare(ua.s, ub.s, is_quiet, s);
2999 }
3000 
3001 FloatRelation float64_compare(float64 a, float64 b, float_status *s)
3002 {
3003     return f64_compare(a, b, false, s);
3004 }
3005 
3006 FloatRelation float64_compare_quiet(float64 a, float64 b, float_status *s)
3007 {
3008     return f64_compare(a, b, true, s);
3009 }
3010 
3011 /* Multiply A by 2 raised to the power N.  */
3012 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s)
3013 {
3014     if (unlikely(is_nan(a.cls))) {
3015         return return_nan(a, s);
3016     }
3017     if (a.cls == float_class_normal) {
3018         /* The largest float type (even though not supported by FloatParts)
3019          * is float128, which has a 15 bit exponent.  Bounding N to 16 bits
3020          * still allows rounding to infinity, without allowing overflow
3021          * within the int32_t that backs FloatParts.exp.
3022          */
3023         n = MIN(MAX(n, -0x10000), 0x10000);
3024         a.exp += n;
3025     }
3026     return a;
3027 }
3028 
3029 float16 float16_scalbn(float16 a, int n, float_status *status)
3030 {
3031     FloatParts pa = float16_unpack_canonical(a, status);
3032     FloatParts pr = scalbn_decomposed(pa, n, status);
3033     return float16_round_pack_canonical(pr, status);
3034 }
3035 
3036 float32 float32_scalbn(float32 a, int n, float_status *status)
3037 {
3038     FloatParts pa = float32_unpack_canonical(a, status);
3039     FloatParts pr = scalbn_decomposed(pa, n, status);
3040     return float32_round_pack_canonical(pr, status);
3041 }
3042 
3043 float64 float64_scalbn(float64 a, int n, float_status *status)
3044 {
3045     FloatParts pa = float64_unpack_canonical(a, status);
3046     FloatParts pr = scalbn_decomposed(pa, n, status);
3047     return float64_round_pack_canonical(pr, status);
3048 }
3049 
3050 /*
3051  * Square Root
3052  *
3053  * The old softfloat code did an approximation step before zeroing in
3054  * on the final result. However for simpleness we just compute the
3055  * square root by iterating down from the implicit bit to enough extra
3056  * bits to ensure we get a correctly rounded result.
3057  *
3058  * This does mean however the calculation is slower than before,
3059  * especially for 64 bit floats.
3060  */
3061 
3062 static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p)
3063 {
3064     uint64_t a_frac, r_frac, s_frac;
3065     int bit, last_bit;
3066 
3067     if (is_nan(a.cls)) {
3068         return return_nan(a, s);
3069     }
3070     if (a.cls == float_class_zero) {
3071         return a;  /* sqrt(+-0) = +-0 */
3072     }
3073     if (a.sign) {
3074         s->float_exception_flags |= float_flag_invalid;
3075         return parts_default_nan(s);
3076     }
3077     if (a.cls == float_class_inf) {
3078         return a;  /* sqrt(+inf) = +inf */
3079     }
3080 
3081     assert(a.cls == float_class_normal);
3082 
3083     /* We need two overflow bits at the top. Adding room for that is a
3084      * right shift. If the exponent is odd, we can discard the low bit
3085      * by multiplying the fraction by 2; that's a left shift. Combine
3086      * those and we shift right if the exponent is even.
3087      */
3088     a_frac = a.frac;
3089     if (!(a.exp & 1)) {
3090         a_frac >>= 1;
3091     }
3092     a.exp >>= 1;
3093 
3094     /* Bit-by-bit computation of sqrt.  */
3095     r_frac = 0;
3096     s_frac = 0;
3097 
3098     /* Iterate from implicit bit down to the 3 extra bits to compute a
3099      * properly rounded result. Remember we've inserted one more bit
3100      * at the top, so these positions are one less.
3101      */
3102     bit = DECOMPOSED_BINARY_POINT - 1;
3103     last_bit = MAX(p->frac_shift - 4, 0);
3104     do {
3105         uint64_t q = 1ULL << bit;
3106         uint64_t t_frac = s_frac + q;
3107         if (t_frac <= a_frac) {
3108             s_frac = t_frac + q;
3109             a_frac -= t_frac;
3110             r_frac += q;
3111         }
3112         a_frac <<= 1;
3113     } while (--bit >= last_bit);
3114 
3115     /* Undo the right shift done above. If there is any remaining
3116      * fraction, the result is inexact. Set the sticky bit.
3117      */
3118     a.frac = (r_frac << 1) + (a_frac != 0);
3119 
3120     return a;
3121 }
3122 
3123 float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status)
3124 {
3125     FloatParts pa = float16_unpack_canonical(a, status);
3126     FloatParts pr = sqrt_float(pa, status, &float16_params);
3127     return float16_round_pack_canonical(pr, status);
3128 }
3129 
3130 static float32 QEMU_SOFTFLOAT_ATTR
3131 soft_f32_sqrt(float32 a, float_status *status)
3132 {
3133     FloatParts pa = float32_unpack_canonical(a, status);
3134     FloatParts pr = sqrt_float(pa, status, &float32_params);
3135     return float32_round_pack_canonical(pr, status);
3136 }
3137 
3138 static float64 QEMU_SOFTFLOAT_ATTR
3139 soft_f64_sqrt(float64 a, float_status *status)
3140 {
3141     FloatParts pa = float64_unpack_canonical(a, status);
3142     FloatParts pr = sqrt_float(pa, status, &float64_params);
3143     return float64_round_pack_canonical(pr, status);
3144 }
3145 
3146 float32 QEMU_FLATTEN float32_sqrt(float32 xa, float_status *s)
3147 {
3148     union_float32 ua, ur;
3149 
3150     ua.s = xa;
3151     if (unlikely(!can_use_fpu(s))) {
3152         goto soft;
3153     }
3154 
3155     float32_input_flush1(&ua.s, s);
3156     if (QEMU_HARDFLOAT_1F32_USE_FP) {
3157         if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
3158                        fpclassify(ua.h) == FP_ZERO) ||
3159                      signbit(ua.h))) {
3160             goto soft;
3161         }
3162     } else if (unlikely(!float32_is_zero_or_normal(ua.s) ||
3163                         float32_is_neg(ua.s))) {
3164         goto soft;
3165     }
3166     ur.h = sqrtf(ua.h);
3167     return ur.s;
3168 
3169  soft:
3170     return soft_f32_sqrt(ua.s, s);
3171 }
3172 
3173 float64 QEMU_FLATTEN float64_sqrt(float64 xa, float_status *s)
3174 {
3175     union_float64 ua, ur;
3176 
3177     ua.s = xa;
3178     if (unlikely(!can_use_fpu(s))) {
3179         goto soft;
3180     }
3181 
3182     float64_input_flush1(&ua.s, s);
3183     if (QEMU_HARDFLOAT_1F64_USE_FP) {
3184         if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
3185                        fpclassify(ua.h) == FP_ZERO) ||
3186                      signbit(ua.h))) {
3187             goto soft;
3188         }
3189     } else if (unlikely(!float64_is_zero_or_normal(ua.s) ||
3190                         float64_is_neg(ua.s))) {
3191         goto soft;
3192     }
3193     ur.h = sqrt(ua.h);
3194     return ur.s;
3195 
3196  soft:
3197     return soft_f64_sqrt(ua.s, s);
3198 }
3199 
3200 /*----------------------------------------------------------------------------
3201 | The pattern for a default generated NaN.
3202 *----------------------------------------------------------------------------*/
3203 
3204 float16 float16_default_nan(float_status *status)
3205 {
3206     FloatParts p = parts_default_nan(status);
3207     p.frac >>= float16_params.frac_shift;
3208     return float16_pack_raw(p);
3209 }
3210 
3211 float32 float32_default_nan(float_status *status)
3212 {
3213     FloatParts p = parts_default_nan(status);
3214     p.frac >>= float32_params.frac_shift;
3215     return float32_pack_raw(p);
3216 }
3217 
3218 float64 float64_default_nan(float_status *status)
3219 {
3220     FloatParts p = parts_default_nan(status);
3221     p.frac >>= float64_params.frac_shift;
3222     return float64_pack_raw(p);
3223 }
3224 
3225 float128 float128_default_nan(float_status *status)
3226 {
3227     FloatParts p = parts_default_nan(status);
3228     float128 r;
3229 
3230     /* Extrapolate from the choices made by parts_default_nan to fill
3231      * in the quad-floating format.  If the low bit is set, assume we
3232      * want to set all non-snan bits.
3233      */
3234     r.low = -(p.frac & 1);
3235     r.high = p.frac >> (DECOMPOSED_BINARY_POINT - 48);
3236     r.high |= UINT64_C(0x7FFF000000000000);
3237     r.high |= (uint64_t)p.sign << 63;
3238 
3239     return r;
3240 }
3241 
3242 /*----------------------------------------------------------------------------
3243 | Returns a quiet NaN from a signalling NaN for the floating point value `a'.
3244 *----------------------------------------------------------------------------*/
3245 
3246 float16 float16_silence_nan(float16 a, float_status *status)
3247 {
3248     FloatParts p = float16_unpack_raw(a);
3249     p.frac <<= float16_params.frac_shift;
3250     p = parts_silence_nan(p, status);
3251     p.frac >>= float16_params.frac_shift;
3252     return float16_pack_raw(p);
3253 }
3254 
3255 float32 float32_silence_nan(float32 a, float_status *status)
3256 {
3257     FloatParts p = float32_unpack_raw(a);
3258     p.frac <<= float32_params.frac_shift;
3259     p = parts_silence_nan(p, status);
3260     p.frac >>= float32_params.frac_shift;
3261     return float32_pack_raw(p);
3262 }
3263 
3264 float64 float64_silence_nan(float64 a, float_status *status)
3265 {
3266     FloatParts p = float64_unpack_raw(a);
3267     p.frac <<= float64_params.frac_shift;
3268     p = parts_silence_nan(p, status);
3269     p.frac >>= float64_params.frac_shift;
3270     return float64_pack_raw(p);
3271 }
3272 
3273 
3274 /*----------------------------------------------------------------------------
3275 | If `a' is denormal and we are in flush-to-zero mode then set the
3276 | input-denormal exception and return zero. Otherwise just return the value.
3277 *----------------------------------------------------------------------------*/
3278 
3279 static bool parts_squash_denormal(FloatParts p, float_status *status)
3280 {
3281     if (p.exp == 0 && p.frac != 0) {
3282         float_raise(float_flag_input_denormal, status);
3283         return true;
3284     }
3285 
3286     return false;
3287 }
3288 
3289 float16 float16_squash_input_denormal(float16 a, float_status *status)
3290 {
3291     if (status->flush_inputs_to_zero) {
3292         FloatParts p = float16_unpack_raw(a);
3293         if (parts_squash_denormal(p, status)) {
3294             return float16_set_sign(float16_zero, p.sign);
3295         }
3296     }
3297     return a;
3298 }
3299 
3300 float32 float32_squash_input_denormal(float32 a, float_status *status)
3301 {
3302     if (status->flush_inputs_to_zero) {
3303         FloatParts p = float32_unpack_raw(a);
3304         if (parts_squash_denormal(p, status)) {
3305             return float32_set_sign(float32_zero, p.sign);
3306         }
3307     }
3308     return a;
3309 }
3310 
3311 float64 float64_squash_input_denormal(float64 a, float_status *status)
3312 {
3313     if (status->flush_inputs_to_zero) {
3314         FloatParts p = float64_unpack_raw(a);
3315         if (parts_squash_denormal(p, status)) {
3316             return float64_set_sign(float64_zero, p.sign);
3317         }
3318     }
3319     return a;
3320 }
3321 
3322 /*----------------------------------------------------------------------------
3323 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
3324 | and 7, and returns the properly rounded 32-bit integer corresponding to the
3325 | input.  If `zSign' is 1, the input is negated before being converted to an
3326 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
3327 | is simply rounded to an integer, with the inexact exception raised if the
3328 | input cannot be represented exactly as an integer.  However, if the fixed-
3329 | point input is too large, the invalid exception is raised and the largest
3330 | positive or negative integer is returned.
3331 *----------------------------------------------------------------------------*/
3332 
3333 static int32_t roundAndPackInt32(bool zSign, uint64_t absZ,
3334                                  float_status *status)
3335 {
3336     int8_t roundingMode;
3337     bool roundNearestEven;
3338     int8_t roundIncrement, roundBits;
3339     int32_t z;
3340 
3341     roundingMode = status->float_rounding_mode;
3342     roundNearestEven = ( roundingMode == float_round_nearest_even );
3343     switch (roundingMode) {
3344     case float_round_nearest_even:
3345     case float_round_ties_away:
3346         roundIncrement = 0x40;
3347         break;
3348     case float_round_to_zero:
3349         roundIncrement = 0;
3350         break;
3351     case float_round_up:
3352         roundIncrement = zSign ? 0 : 0x7f;
3353         break;
3354     case float_round_down:
3355         roundIncrement = zSign ? 0x7f : 0;
3356         break;
3357     case float_round_to_odd:
3358         roundIncrement = absZ & 0x80 ? 0 : 0x7f;
3359         break;
3360     default:
3361         abort();
3362     }
3363     roundBits = absZ & 0x7F;
3364     absZ = ( absZ + roundIncrement )>>7;
3365     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
3366     z = absZ;
3367     if ( zSign ) z = - z;
3368     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
3369         float_raise(float_flag_invalid, status);
3370         return zSign ? INT32_MIN : INT32_MAX;
3371     }
3372     if (roundBits) {
3373         status->float_exception_flags |= float_flag_inexact;
3374     }
3375     return z;
3376 
3377 }
3378 
3379 /*----------------------------------------------------------------------------
3380 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3381 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3382 | and returns the properly rounded 64-bit integer corresponding to the input.
3383 | If `zSign' is 1, the input is negated before being converted to an integer.
3384 | Ordinarily, the fixed-point input is simply rounded to an integer, with
3385 | the inexact exception raised if the input cannot be represented exactly as
3386 | an integer.  However, if the fixed-point input is too large, the invalid
3387 | exception is raised and the largest positive or negative integer is
3388 | returned.
3389 *----------------------------------------------------------------------------*/
3390 
3391 static int64_t roundAndPackInt64(bool zSign, uint64_t absZ0, uint64_t absZ1,
3392                                float_status *status)
3393 {
3394     int8_t roundingMode;
3395     bool roundNearestEven, increment;
3396     int64_t z;
3397 
3398     roundingMode = status->float_rounding_mode;
3399     roundNearestEven = ( roundingMode == float_round_nearest_even );
3400     switch (roundingMode) {
3401     case float_round_nearest_even:
3402     case float_round_ties_away:
3403         increment = ((int64_t) absZ1 < 0);
3404         break;
3405     case float_round_to_zero:
3406         increment = 0;
3407         break;
3408     case float_round_up:
3409         increment = !zSign && absZ1;
3410         break;
3411     case float_round_down:
3412         increment = zSign && absZ1;
3413         break;
3414     case float_round_to_odd:
3415         increment = !(absZ0 & 1) && absZ1;
3416         break;
3417     default:
3418         abort();
3419     }
3420     if ( increment ) {
3421         ++absZ0;
3422         if ( absZ0 == 0 ) goto overflow;
3423         absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
3424     }
3425     z = absZ0;
3426     if ( zSign ) z = - z;
3427     if ( z && ( ( z < 0 ) ^ zSign ) ) {
3428  overflow:
3429         float_raise(float_flag_invalid, status);
3430         return zSign ? INT64_MIN : INT64_MAX;
3431     }
3432     if (absZ1) {
3433         status->float_exception_flags |= float_flag_inexact;
3434     }
3435     return z;
3436 
3437 }
3438 
3439 /*----------------------------------------------------------------------------
3440 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3441 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3442 | and returns the properly rounded 64-bit unsigned integer corresponding to the
3443 | input.  Ordinarily, the fixed-point input is simply rounded to an integer,
3444 | with the inexact exception raised if the input cannot be represented exactly
3445 | as an integer.  However, if the fixed-point input is too large, the invalid
3446 | exception is raised and the largest unsigned integer is returned.
3447 *----------------------------------------------------------------------------*/
3448 
3449 static int64_t roundAndPackUint64(bool zSign, uint64_t absZ0,
3450                                 uint64_t absZ1, float_status *status)
3451 {
3452     int8_t roundingMode;
3453     bool roundNearestEven, increment;
3454 
3455     roundingMode = status->float_rounding_mode;
3456     roundNearestEven = (roundingMode == float_round_nearest_even);
3457     switch (roundingMode) {
3458     case float_round_nearest_even:
3459     case float_round_ties_away:
3460         increment = ((int64_t)absZ1 < 0);
3461         break;
3462     case float_round_to_zero:
3463         increment = 0;
3464         break;
3465     case float_round_up:
3466         increment = !zSign && absZ1;
3467         break;
3468     case float_round_down:
3469         increment = zSign && absZ1;
3470         break;
3471     case float_round_to_odd:
3472         increment = !(absZ0 & 1) && absZ1;
3473         break;
3474     default:
3475         abort();
3476     }
3477     if (increment) {
3478         ++absZ0;
3479         if (absZ0 == 0) {
3480             float_raise(float_flag_invalid, status);
3481             return UINT64_MAX;
3482         }
3483         absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
3484     }
3485 
3486     if (zSign && absZ0) {
3487         float_raise(float_flag_invalid, status);
3488         return 0;
3489     }
3490 
3491     if (absZ1) {
3492         status->float_exception_flags |= float_flag_inexact;
3493     }
3494     return absZ0;
3495 }
3496 
3497 /*----------------------------------------------------------------------------
3498 | Normalizes the subnormal single-precision floating-point value represented
3499 | by the denormalized significand `aSig'.  The normalized exponent and
3500 | significand are stored at the locations pointed to by `zExpPtr' and
3501 | `zSigPtr', respectively.
3502 *----------------------------------------------------------------------------*/
3503 
3504 static void
3505  normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
3506 {
3507     int8_t shiftCount;
3508 
3509     shiftCount = clz32(aSig) - 8;
3510     *zSigPtr = aSig<<shiftCount;
3511     *zExpPtr = 1 - shiftCount;
3512 
3513 }
3514 
3515 /*----------------------------------------------------------------------------
3516 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3517 | and significand `zSig', and returns the proper single-precision floating-
3518 | point value corresponding to the abstract input.  Ordinarily, the abstract
3519 | value is simply rounded and packed into the single-precision format, with
3520 | the inexact exception raised if the abstract input cannot be represented
3521 | exactly.  However, if the abstract value is too large, the overflow and
3522 | inexact exceptions are raised and an infinity or maximal finite value is
3523 | returned.  If the abstract value is too small, the input value is rounded to
3524 | a subnormal number, and the underflow and inexact exceptions are raised if
3525 | the abstract input cannot be represented exactly as a subnormal single-
3526 | precision floating-point number.
3527 |     The input significand `zSig' has its binary point between bits 30
3528 | and 29, which is 7 bits to the left of the usual location.  This shifted
3529 | significand must be normalized or smaller.  If `zSig' is not normalized,
3530 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3531 | and it must not require rounding.  In the usual case that `zSig' is
3532 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3533 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3534 | Binary Floating-Point Arithmetic.
3535 *----------------------------------------------------------------------------*/
3536 
3537 static float32 roundAndPackFloat32(bool zSign, int zExp, uint32_t zSig,
3538                                    float_status *status)
3539 {
3540     int8_t roundingMode;
3541     bool roundNearestEven;
3542     int8_t roundIncrement, roundBits;
3543     bool isTiny;
3544 
3545     roundingMode = status->float_rounding_mode;
3546     roundNearestEven = ( roundingMode == float_round_nearest_even );
3547     switch (roundingMode) {
3548     case float_round_nearest_even:
3549     case float_round_ties_away:
3550         roundIncrement = 0x40;
3551         break;
3552     case float_round_to_zero:
3553         roundIncrement = 0;
3554         break;
3555     case float_round_up:
3556         roundIncrement = zSign ? 0 : 0x7f;
3557         break;
3558     case float_round_down:
3559         roundIncrement = zSign ? 0x7f : 0;
3560         break;
3561     case float_round_to_odd:
3562         roundIncrement = zSig & 0x80 ? 0 : 0x7f;
3563         break;
3564     default:
3565         abort();
3566         break;
3567     }
3568     roundBits = zSig & 0x7F;
3569     if ( 0xFD <= (uint16_t) zExp ) {
3570         if (    ( 0xFD < zExp )
3571              || (    ( zExp == 0xFD )
3572                   && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
3573            ) {
3574             bool overflow_to_inf = roundingMode != float_round_to_odd &&
3575                                    roundIncrement != 0;
3576             float_raise(float_flag_overflow | float_flag_inexact, status);
3577             return packFloat32(zSign, 0xFF, -!overflow_to_inf);
3578         }
3579         if ( zExp < 0 ) {
3580             if (status->flush_to_zero) {
3581                 float_raise(float_flag_output_denormal, status);
3582                 return packFloat32(zSign, 0, 0);
3583             }
3584             isTiny = status->tininess_before_rounding
3585                   || (zExp < -1)
3586                   || (zSig + roundIncrement < 0x80000000);
3587             shift32RightJamming( zSig, - zExp, &zSig );
3588             zExp = 0;
3589             roundBits = zSig & 0x7F;
3590             if (isTiny && roundBits) {
3591                 float_raise(float_flag_underflow, status);
3592             }
3593             if (roundingMode == float_round_to_odd) {
3594                 /*
3595                  * For round-to-odd case, the roundIncrement depends on
3596                  * zSig which just changed.
3597                  */
3598                 roundIncrement = zSig & 0x80 ? 0 : 0x7f;
3599             }
3600         }
3601     }
3602     if (roundBits) {
3603         status->float_exception_flags |= float_flag_inexact;
3604     }
3605     zSig = ( zSig + roundIncrement )>>7;
3606     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
3607     if ( zSig == 0 ) zExp = 0;
3608     return packFloat32( zSign, zExp, zSig );
3609 
3610 }
3611 
3612 /*----------------------------------------------------------------------------
3613 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3614 | and significand `zSig', and returns the proper single-precision floating-
3615 | point value corresponding to the abstract input.  This routine is just like
3616 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
3617 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
3618 | floating-point exponent.
3619 *----------------------------------------------------------------------------*/
3620 
3621 static float32
3622  normalizeRoundAndPackFloat32(bool zSign, int zExp, uint32_t zSig,
3623                               float_status *status)
3624 {
3625     int8_t shiftCount;
3626 
3627     shiftCount = clz32(zSig) - 1;
3628     return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
3629                                status);
3630 
3631 }
3632 
3633 /*----------------------------------------------------------------------------
3634 | Normalizes the subnormal double-precision floating-point value represented
3635 | by the denormalized significand `aSig'.  The normalized exponent and
3636 | significand are stored at the locations pointed to by `zExpPtr' and
3637 | `zSigPtr', respectively.
3638 *----------------------------------------------------------------------------*/
3639 
3640 static void
3641  normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
3642 {
3643     int8_t shiftCount;
3644 
3645     shiftCount = clz64(aSig) - 11;
3646     *zSigPtr = aSig<<shiftCount;
3647     *zExpPtr = 1 - shiftCount;
3648 
3649 }
3650 
3651 /*----------------------------------------------------------------------------
3652 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3653 | double-precision floating-point value, returning the result.  After being
3654 | shifted into the proper positions, the three fields are simply added
3655 | together to form the result.  This means that any integer portion of `zSig'
3656 | will be added into the exponent.  Since a properly normalized significand
3657 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3658 | than the desired result exponent whenever `zSig' is a complete, normalized
3659 | significand.
3660 *----------------------------------------------------------------------------*/
3661 
3662 static inline float64 packFloat64(bool zSign, int zExp, uint64_t zSig)
3663 {
3664 
3665     return make_float64(
3666         ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
3667 
3668 }
3669 
3670 /*----------------------------------------------------------------------------
3671 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3672 | and significand `zSig', and returns the proper double-precision floating-
3673 | point value corresponding to the abstract input.  Ordinarily, the abstract
3674 | value is simply rounded and packed into the double-precision format, with
3675 | the inexact exception raised if the abstract input cannot be represented
3676 | exactly.  However, if the abstract value is too large, the overflow and
3677 | inexact exceptions are raised and an infinity or maximal finite value is
3678 | returned.  If the abstract value is too small, the input value is rounded to
3679 | a subnormal number, and the underflow and inexact exceptions are raised if
3680 | the abstract input cannot be represented exactly as a subnormal double-
3681 | precision floating-point number.
3682 |     The input significand `zSig' has its binary point between bits 62
3683 | and 61, which is 10 bits to the left of the usual location.  This shifted
3684 | significand must be normalized or smaller.  If `zSig' is not normalized,
3685 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3686 | and it must not require rounding.  In the usual case that `zSig' is
3687 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3688 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3689 | Binary Floating-Point Arithmetic.
3690 *----------------------------------------------------------------------------*/
3691 
3692 static float64 roundAndPackFloat64(bool zSign, int zExp, uint64_t zSig,
3693                                    float_status *status)
3694 {
3695     int8_t roundingMode;
3696     bool roundNearestEven;
3697     int roundIncrement, roundBits;
3698     bool isTiny;
3699 
3700     roundingMode = status->float_rounding_mode;
3701     roundNearestEven = ( roundingMode == float_round_nearest_even );
3702     switch (roundingMode) {
3703     case float_round_nearest_even:
3704     case float_round_ties_away:
3705         roundIncrement = 0x200;
3706         break;
3707     case float_round_to_zero:
3708         roundIncrement = 0;
3709         break;
3710     case float_round_up:
3711         roundIncrement = zSign ? 0 : 0x3ff;
3712         break;
3713     case float_round_down:
3714         roundIncrement = zSign ? 0x3ff : 0;
3715         break;
3716     case float_round_to_odd:
3717         roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
3718         break;
3719     default:
3720         abort();
3721     }
3722     roundBits = zSig & 0x3FF;
3723     if ( 0x7FD <= (uint16_t) zExp ) {
3724         if (    ( 0x7FD < zExp )
3725              || (    ( zExp == 0x7FD )
3726                   && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
3727            ) {
3728             bool overflow_to_inf = roundingMode != float_round_to_odd &&
3729                                    roundIncrement != 0;
3730             float_raise(float_flag_overflow | float_flag_inexact, status);
3731             return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
3732         }
3733         if ( zExp < 0 ) {
3734             if (status->flush_to_zero) {
3735                 float_raise(float_flag_output_denormal, status);
3736                 return packFloat64(zSign, 0, 0);
3737             }
3738             isTiny = status->tininess_before_rounding
3739                   || (zExp < -1)
3740                   || (zSig + roundIncrement < UINT64_C(0x8000000000000000));
3741             shift64RightJamming( zSig, - zExp, &zSig );
3742             zExp = 0;
3743             roundBits = zSig & 0x3FF;
3744             if (isTiny && roundBits) {
3745                 float_raise(float_flag_underflow, status);
3746             }
3747             if (roundingMode == float_round_to_odd) {
3748                 /*
3749                  * For round-to-odd case, the roundIncrement depends on
3750                  * zSig which just changed.
3751                  */
3752                 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
3753             }
3754         }
3755     }
3756     if (roundBits) {
3757         status->float_exception_flags |= float_flag_inexact;
3758     }
3759     zSig = ( zSig + roundIncrement )>>10;
3760     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
3761     if ( zSig == 0 ) zExp = 0;
3762     return packFloat64( zSign, zExp, zSig );
3763 
3764 }
3765 
3766 /*----------------------------------------------------------------------------
3767 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3768 | and significand `zSig', and returns the proper double-precision floating-
3769 | point value corresponding to the abstract input.  This routine is just like
3770 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
3771 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
3772 | floating-point exponent.
3773 *----------------------------------------------------------------------------*/
3774 
3775 static float64
3776  normalizeRoundAndPackFloat64(bool zSign, int zExp, uint64_t zSig,
3777                               float_status *status)
3778 {
3779     int8_t shiftCount;
3780 
3781     shiftCount = clz64(zSig) - 1;
3782     return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
3783                                status);
3784 
3785 }
3786 
3787 /*----------------------------------------------------------------------------
3788 | Normalizes the subnormal extended double-precision floating-point value
3789 | represented by the denormalized significand `aSig'.  The normalized exponent
3790 | and significand are stored at the locations pointed to by `zExpPtr' and
3791 | `zSigPtr', respectively.
3792 *----------------------------------------------------------------------------*/
3793 
3794 void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
3795                                 uint64_t *zSigPtr)
3796 {
3797     int8_t shiftCount;
3798 
3799     shiftCount = clz64(aSig);
3800     *zSigPtr = aSig<<shiftCount;
3801     *zExpPtr = 1 - shiftCount;
3802 }
3803 
3804 /*----------------------------------------------------------------------------
3805 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3806 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
3807 | and returns the proper extended double-precision floating-point value
3808 | corresponding to the abstract input.  Ordinarily, the abstract value is
3809 | rounded and packed into the extended double-precision format, with the
3810 | inexact exception raised if the abstract input cannot be represented
3811 | exactly.  However, if the abstract value is too large, the overflow and
3812 | inexact exceptions are raised and an infinity or maximal finite value is
3813 | returned.  If the abstract value is too small, the input value is rounded to
3814 | a subnormal number, and the underflow and inexact exceptions are raised if
3815 | the abstract input cannot be represented exactly as a subnormal extended
3816 | double-precision floating-point number.
3817 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
3818 | number of bits as single or double precision, respectively.  Otherwise, the
3819 | result is rounded to the full precision of the extended double-precision
3820 | format.
3821 |     The input significand must be normalized or smaller.  If the input
3822 | significand is not normalized, `zExp' must be 0; in that case, the result
3823 | returned is a subnormal number, and it must not require rounding.  The
3824 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
3825 | Floating-Point Arithmetic.
3826 *----------------------------------------------------------------------------*/
3827 
3828 floatx80 roundAndPackFloatx80(int8_t roundingPrecision, bool zSign,
3829                               int32_t zExp, uint64_t zSig0, uint64_t zSig1,
3830                               float_status *status)
3831 {
3832     int8_t roundingMode;
3833     bool roundNearestEven, increment, isTiny;
3834     int64_t roundIncrement, roundMask, roundBits;
3835 
3836     roundingMode = status->float_rounding_mode;
3837     roundNearestEven = ( roundingMode == float_round_nearest_even );
3838     if ( roundingPrecision == 80 ) goto precision80;
3839     if ( roundingPrecision == 64 ) {
3840         roundIncrement = UINT64_C(0x0000000000000400);
3841         roundMask = UINT64_C(0x00000000000007FF);
3842     }
3843     else if ( roundingPrecision == 32 ) {
3844         roundIncrement = UINT64_C(0x0000008000000000);
3845         roundMask = UINT64_C(0x000000FFFFFFFFFF);
3846     }
3847     else {
3848         goto precision80;
3849     }
3850     zSig0 |= ( zSig1 != 0 );
3851     switch (roundingMode) {
3852     case float_round_nearest_even:
3853     case float_round_ties_away:
3854         break;
3855     case float_round_to_zero:
3856         roundIncrement = 0;
3857         break;
3858     case float_round_up:
3859         roundIncrement = zSign ? 0 : roundMask;
3860         break;
3861     case float_round_down:
3862         roundIncrement = zSign ? roundMask : 0;
3863         break;
3864     default:
3865         abort();
3866     }
3867     roundBits = zSig0 & roundMask;
3868     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
3869         if (    ( 0x7FFE < zExp )
3870              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
3871            ) {
3872             goto overflow;
3873         }
3874         if ( zExp <= 0 ) {
3875             if (status->flush_to_zero) {
3876                 float_raise(float_flag_output_denormal, status);
3877                 return packFloatx80(zSign, 0, 0);
3878             }
3879             isTiny = status->tininess_before_rounding
3880                   || (zExp < 0 )
3881                   || (zSig0 <= zSig0 + roundIncrement);
3882             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
3883             zExp = 0;
3884             roundBits = zSig0 & roundMask;
3885             if (isTiny && roundBits) {
3886                 float_raise(float_flag_underflow, status);
3887             }
3888             if (roundBits) {
3889                 status->float_exception_flags |= float_flag_inexact;
3890             }
3891             zSig0 += roundIncrement;
3892             if ( (int64_t) zSig0 < 0 ) zExp = 1;
3893             roundIncrement = roundMask + 1;
3894             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
3895                 roundMask |= roundIncrement;
3896             }
3897             zSig0 &= ~ roundMask;
3898             return packFloatx80( zSign, zExp, zSig0 );
3899         }
3900     }
3901     if (roundBits) {
3902         status->float_exception_flags |= float_flag_inexact;
3903     }
3904     zSig0 += roundIncrement;
3905     if ( zSig0 < roundIncrement ) {
3906         ++zExp;
3907         zSig0 = UINT64_C(0x8000000000000000);
3908     }
3909     roundIncrement = roundMask + 1;
3910     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
3911         roundMask |= roundIncrement;
3912     }
3913     zSig0 &= ~ roundMask;
3914     if ( zSig0 == 0 ) zExp = 0;
3915     return packFloatx80( zSign, zExp, zSig0 );
3916  precision80:
3917     switch (roundingMode) {
3918     case float_round_nearest_even:
3919     case float_round_ties_away:
3920         increment = ((int64_t)zSig1 < 0);
3921         break;
3922     case float_round_to_zero:
3923         increment = 0;
3924         break;
3925     case float_round_up:
3926         increment = !zSign && zSig1;
3927         break;
3928     case float_round_down:
3929         increment = zSign && zSig1;
3930         break;
3931     default:
3932         abort();
3933     }
3934     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
3935         if (    ( 0x7FFE < zExp )
3936              || (    ( zExp == 0x7FFE )
3937                   && ( zSig0 == UINT64_C(0xFFFFFFFFFFFFFFFF) )
3938                   && increment
3939                 )
3940            ) {
3941             roundMask = 0;
3942  overflow:
3943             float_raise(float_flag_overflow | float_flag_inexact, status);
3944             if (    ( roundingMode == float_round_to_zero )
3945                  || ( zSign && ( roundingMode == float_round_up ) )
3946                  || ( ! zSign && ( roundingMode == float_round_down ) )
3947                ) {
3948                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
3949             }
3950             return packFloatx80(zSign,
3951                                 floatx80_infinity_high,
3952                                 floatx80_infinity_low);
3953         }
3954         if ( zExp <= 0 ) {
3955             isTiny = status->tininess_before_rounding
3956                   || (zExp < 0)
3957                   || !increment
3958                   || (zSig0 < UINT64_C(0xFFFFFFFFFFFFFFFF));
3959             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
3960             zExp = 0;
3961             if (isTiny && zSig1) {
3962                 float_raise(float_flag_underflow, status);
3963             }
3964             if (zSig1) {
3965                 status->float_exception_flags |= float_flag_inexact;
3966             }
3967             switch (roundingMode) {
3968             case float_round_nearest_even:
3969             case float_round_ties_away:
3970                 increment = ((int64_t)zSig1 < 0);
3971                 break;
3972             case float_round_to_zero:
3973                 increment = 0;
3974                 break;
3975             case float_round_up:
3976                 increment = !zSign && zSig1;
3977                 break;
3978             case float_round_down:
3979                 increment = zSign && zSig1;
3980                 break;
3981             default:
3982                 abort();
3983             }
3984             if ( increment ) {
3985                 ++zSig0;
3986                 zSig0 &=
3987                     ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
3988                 if ( (int64_t) zSig0 < 0 ) zExp = 1;
3989             }
3990             return packFloatx80( zSign, zExp, zSig0 );
3991         }
3992     }
3993     if (zSig1) {
3994         status->float_exception_flags |= float_flag_inexact;
3995     }
3996     if ( increment ) {
3997         ++zSig0;
3998         if ( zSig0 == 0 ) {
3999             ++zExp;
4000             zSig0 = UINT64_C(0x8000000000000000);
4001         }
4002         else {
4003             zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
4004         }
4005     }
4006     else {
4007         if ( zSig0 == 0 ) zExp = 0;
4008     }
4009     return packFloatx80( zSign, zExp, zSig0 );
4010 
4011 }
4012 
4013 /*----------------------------------------------------------------------------
4014 | Takes an abstract floating-point value having sign `zSign', exponent
4015 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
4016 | and returns the proper extended double-precision floating-point value
4017 | corresponding to the abstract input.  This routine is just like
4018 | `roundAndPackFloatx80' except that the input significand does not have to be
4019 | normalized.
4020 *----------------------------------------------------------------------------*/
4021 
4022 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
4023                                        bool zSign, int32_t zExp,
4024                                        uint64_t zSig0, uint64_t zSig1,
4025                                        float_status *status)
4026 {
4027     int8_t shiftCount;
4028 
4029     if ( zSig0 == 0 ) {
4030         zSig0 = zSig1;
4031         zSig1 = 0;
4032         zExp -= 64;
4033     }
4034     shiftCount = clz64(zSig0);
4035     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4036     zExp -= shiftCount;
4037     return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
4038                                 zSig0, zSig1, status);
4039 
4040 }
4041 
4042 /*----------------------------------------------------------------------------
4043 | Returns the least-significant 64 fraction bits of the quadruple-precision
4044 | floating-point value `a'.
4045 *----------------------------------------------------------------------------*/
4046 
4047 static inline uint64_t extractFloat128Frac1( float128 a )
4048 {
4049 
4050     return a.low;
4051 
4052 }
4053 
4054 /*----------------------------------------------------------------------------
4055 | Returns the most-significant 48 fraction bits of the quadruple-precision
4056 | floating-point value `a'.
4057 *----------------------------------------------------------------------------*/
4058 
4059 static inline uint64_t extractFloat128Frac0( float128 a )
4060 {
4061 
4062     return a.high & UINT64_C(0x0000FFFFFFFFFFFF);
4063 
4064 }
4065 
4066 /*----------------------------------------------------------------------------
4067 | Returns the exponent bits of the quadruple-precision floating-point value
4068 | `a'.
4069 *----------------------------------------------------------------------------*/
4070 
4071 static inline int32_t extractFloat128Exp( float128 a )
4072 {
4073 
4074     return ( a.high>>48 ) & 0x7FFF;
4075 
4076 }
4077 
4078 /*----------------------------------------------------------------------------
4079 | Returns the sign bit of the quadruple-precision floating-point value `a'.
4080 *----------------------------------------------------------------------------*/
4081 
4082 static inline bool extractFloat128Sign(float128 a)
4083 {
4084     return a.high >> 63;
4085 }
4086 
4087 /*----------------------------------------------------------------------------
4088 | Normalizes the subnormal quadruple-precision floating-point value
4089 | represented by the denormalized significand formed by the concatenation of
4090 | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
4091 | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
4092 | significand are stored at the location pointed to by `zSig0Ptr', and the
4093 | least significant 64 bits of the normalized significand are stored at the
4094 | location pointed to by `zSig1Ptr'.
4095 *----------------------------------------------------------------------------*/
4096 
4097 static void
4098  normalizeFloat128Subnormal(
4099      uint64_t aSig0,
4100      uint64_t aSig1,
4101      int32_t *zExpPtr,
4102      uint64_t *zSig0Ptr,
4103      uint64_t *zSig1Ptr
4104  )
4105 {
4106     int8_t shiftCount;
4107 
4108     if ( aSig0 == 0 ) {
4109         shiftCount = clz64(aSig1) - 15;
4110         if ( shiftCount < 0 ) {
4111             *zSig0Ptr = aSig1>>( - shiftCount );
4112             *zSig1Ptr = aSig1<<( shiftCount & 63 );
4113         }
4114         else {
4115             *zSig0Ptr = aSig1<<shiftCount;
4116             *zSig1Ptr = 0;
4117         }
4118         *zExpPtr = - shiftCount - 63;
4119     }
4120     else {
4121         shiftCount = clz64(aSig0) - 15;
4122         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
4123         *zExpPtr = 1 - shiftCount;
4124     }
4125 
4126 }
4127 
4128 /*----------------------------------------------------------------------------
4129 | Packs the sign `zSign', the exponent `zExp', and the significand formed
4130 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
4131 | floating-point value, returning the result.  After being shifted into the
4132 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
4133 | added together to form the most significant 32 bits of the result.  This
4134 | means that any integer portion of `zSig0' will be added into the exponent.
4135 | Since a properly normalized significand will have an integer portion equal
4136 | to 1, the `zExp' input should be 1 less than the desired result exponent
4137 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
4138 | significand.
4139 *----------------------------------------------------------------------------*/
4140 
4141 static inline float128
4142 packFloat128(bool zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1)
4143 {
4144     float128 z;
4145 
4146     z.low = zSig1;
4147     z.high = ((uint64_t)zSign << 63) + ((uint64_t)zExp << 48) + zSig0;
4148     return z;
4149 }
4150 
4151 /*----------------------------------------------------------------------------
4152 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4153 | and extended significand formed by the concatenation of `zSig0', `zSig1',
4154 | and `zSig2', and returns the proper quadruple-precision floating-point value
4155 | corresponding to the abstract input.  Ordinarily, the abstract value is
4156 | simply rounded and packed into the quadruple-precision format, with the
4157 | inexact exception raised if the abstract input cannot be represented
4158 | exactly.  However, if the abstract value is too large, the overflow and
4159 | inexact exceptions are raised and an infinity or maximal finite value is
4160 | returned.  If the abstract value is too small, the input value is rounded to
4161 | a subnormal number, and the underflow and inexact exceptions are raised if
4162 | the abstract input cannot be represented exactly as a subnormal quadruple-
4163 | precision floating-point number.
4164 |     The input significand must be normalized or smaller.  If the input
4165 | significand is not normalized, `zExp' must be 0; in that case, the result
4166 | returned is a subnormal number, and it must not require rounding.  In the
4167 | usual case that the input significand is normalized, `zExp' must be 1 less
4168 | than the ``true'' floating-point exponent.  The handling of underflow and
4169 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4170 *----------------------------------------------------------------------------*/
4171 
4172 static float128 roundAndPackFloat128(bool zSign, int32_t zExp,
4173                                      uint64_t zSig0, uint64_t zSig1,
4174                                      uint64_t zSig2, float_status *status)
4175 {
4176     int8_t roundingMode;
4177     bool roundNearestEven, increment, isTiny;
4178 
4179     roundingMode = status->float_rounding_mode;
4180     roundNearestEven = ( roundingMode == float_round_nearest_even );
4181     switch (roundingMode) {
4182     case float_round_nearest_even:
4183     case float_round_ties_away:
4184         increment = ((int64_t)zSig2 < 0);
4185         break;
4186     case float_round_to_zero:
4187         increment = 0;
4188         break;
4189     case float_round_up:
4190         increment = !zSign && zSig2;
4191         break;
4192     case float_round_down:
4193         increment = zSign && zSig2;
4194         break;
4195     case float_round_to_odd:
4196         increment = !(zSig1 & 0x1) && zSig2;
4197         break;
4198     default:
4199         abort();
4200     }
4201     if ( 0x7FFD <= (uint32_t) zExp ) {
4202         if (    ( 0x7FFD < zExp )
4203              || (    ( zExp == 0x7FFD )
4204                   && eq128(
4205                          UINT64_C(0x0001FFFFFFFFFFFF),
4206                          UINT64_C(0xFFFFFFFFFFFFFFFF),
4207                          zSig0,
4208                          zSig1
4209                      )
4210                   && increment
4211                 )
4212            ) {
4213             float_raise(float_flag_overflow | float_flag_inexact, status);
4214             if (    ( roundingMode == float_round_to_zero )
4215                  || ( zSign && ( roundingMode == float_round_up ) )
4216                  || ( ! zSign && ( roundingMode == float_round_down ) )
4217                  || (roundingMode == float_round_to_odd)
4218                ) {
4219                 return
4220                     packFloat128(
4221                         zSign,
4222                         0x7FFE,
4223                         UINT64_C(0x0000FFFFFFFFFFFF),
4224                         UINT64_C(0xFFFFFFFFFFFFFFFF)
4225                     );
4226             }
4227             return packFloat128( zSign, 0x7FFF, 0, 0 );
4228         }
4229         if ( zExp < 0 ) {
4230             if (status->flush_to_zero) {
4231                 float_raise(float_flag_output_denormal, status);
4232                 return packFloat128(zSign, 0, 0, 0);
4233             }
4234             isTiny = status->tininess_before_rounding
4235                   || (zExp < -1)
4236                   || !increment
4237                   || lt128(zSig0, zSig1,
4238                            UINT64_C(0x0001FFFFFFFFFFFF),
4239                            UINT64_C(0xFFFFFFFFFFFFFFFF));
4240             shift128ExtraRightJamming(
4241                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
4242             zExp = 0;
4243             if (isTiny && zSig2) {
4244                 float_raise(float_flag_underflow, status);
4245             }
4246             switch (roundingMode) {
4247             case float_round_nearest_even:
4248             case float_round_ties_away:
4249                 increment = ((int64_t)zSig2 < 0);
4250                 break;
4251             case float_round_to_zero:
4252                 increment = 0;
4253                 break;
4254             case float_round_up:
4255                 increment = !zSign && zSig2;
4256                 break;
4257             case float_round_down:
4258                 increment = zSign && zSig2;
4259                 break;
4260             case float_round_to_odd:
4261                 increment = !(zSig1 & 0x1) && zSig2;
4262                 break;
4263             default:
4264                 abort();
4265             }
4266         }
4267     }
4268     if (zSig2) {
4269         status->float_exception_flags |= float_flag_inexact;
4270     }
4271     if ( increment ) {
4272         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
4273         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
4274     }
4275     else {
4276         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
4277     }
4278     return packFloat128( zSign, zExp, zSig0, zSig1 );
4279 
4280 }
4281 
4282 /*----------------------------------------------------------------------------
4283 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4284 | and significand formed by the concatenation of `zSig0' and `zSig1', and
4285 | returns the proper quadruple-precision floating-point value corresponding
4286 | to the abstract input.  This routine is just like `roundAndPackFloat128'
4287 | except that the input significand has fewer bits and does not have to be
4288 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
4289 | point exponent.
4290 *----------------------------------------------------------------------------*/
4291 
4292 static float128 normalizeRoundAndPackFloat128(bool zSign, int32_t zExp,
4293                                               uint64_t zSig0, uint64_t zSig1,
4294                                               float_status *status)
4295 {
4296     int8_t shiftCount;
4297     uint64_t zSig2;
4298 
4299     if ( zSig0 == 0 ) {
4300         zSig0 = zSig1;
4301         zSig1 = 0;
4302         zExp -= 64;
4303     }
4304     shiftCount = clz64(zSig0) - 15;
4305     if ( 0 <= shiftCount ) {
4306         zSig2 = 0;
4307         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4308     }
4309     else {
4310         shift128ExtraRightJamming(
4311             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
4312     }
4313     zExp -= shiftCount;
4314     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
4315 
4316 }
4317 
4318 
4319 /*----------------------------------------------------------------------------
4320 | Returns the result of converting the 32-bit two's complement integer `a'
4321 | to the extended double-precision floating-point format.  The conversion
4322 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4323 | Arithmetic.
4324 *----------------------------------------------------------------------------*/
4325 
4326 floatx80 int32_to_floatx80(int32_t a, float_status *status)
4327 {
4328     bool zSign;
4329     uint32_t absA;
4330     int8_t shiftCount;
4331     uint64_t zSig;
4332 
4333     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
4334     zSign = ( a < 0 );
4335     absA = zSign ? - a : a;
4336     shiftCount = clz32(absA) + 32;
4337     zSig = absA;
4338     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
4339 
4340 }
4341 
4342 /*----------------------------------------------------------------------------
4343 | Returns the result of converting the 32-bit two's complement integer `a' to
4344 | the quadruple-precision floating-point format.  The conversion is performed
4345 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4346 *----------------------------------------------------------------------------*/
4347 
4348 float128 int32_to_float128(int32_t a, float_status *status)
4349 {
4350     bool zSign;
4351     uint32_t absA;
4352     int8_t shiftCount;
4353     uint64_t zSig0;
4354 
4355     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
4356     zSign = ( a < 0 );
4357     absA = zSign ? - a : a;
4358     shiftCount = clz32(absA) + 17;
4359     zSig0 = absA;
4360     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
4361 
4362 }
4363 
4364 /*----------------------------------------------------------------------------
4365 | Returns the result of converting the 64-bit two's complement integer `a'
4366 | to the extended double-precision floating-point format.  The conversion
4367 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4368 | Arithmetic.
4369 *----------------------------------------------------------------------------*/
4370 
4371 floatx80 int64_to_floatx80(int64_t a, float_status *status)
4372 {
4373     bool zSign;
4374     uint64_t absA;
4375     int8_t shiftCount;
4376 
4377     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
4378     zSign = ( a < 0 );
4379     absA = zSign ? - a : a;
4380     shiftCount = clz64(absA);
4381     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
4382 
4383 }
4384 
4385 /*----------------------------------------------------------------------------
4386 | Returns the result of converting the 64-bit two's complement integer `a' to
4387 | the quadruple-precision floating-point format.  The conversion is performed
4388 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4389 *----------------------------------------------------------------------------*/
4390 
4391 float128 int64_to_float128(int64_t a, float_status *status)
4392 {
4393     bool zSign;
4394     uint64_t absA;
4395     int8_t shiftCount;
4396     int32_t zExp;
4397     uint64_t zSig0, zSig1;
4398 
4399     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
4400     zSign = ( a < 0 );
4401     absA = zSign ? - a : a;
4402     shiftCount = clz64(absA) + 49;
4403     zExp = 0x406E - shiftCount;
4404     if ( 64 <= shiftCount ) {
4405         zSig1 = 0;
4406         zSig0 = absA;
4407         shiftCount -= 64;
4408     }
4409     else {
4410         zSig1 = absA;
4411         zSig0 = 0;
4412     }
4413     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4414     return packFloat128( zSign, zExp, zSig0, zSig1 );
4415 
4416 }
4417 
4418 /*----------------------------------------------------------------------------
4419 | Returns the result of converting the 64-bit unsigned integer `a'
4420 | to the quadruple-precision floating-point format.  The conversion is performed
4421 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4422 *----------------------------------------------------------------------------*/
4423 
4424 float128 uint64_to_float128(uint64_t a, float_status *status)
4425 {
4426     if (a == 0) {
4427         return float128_zero;
4428     }
4429     return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status);
4430 }
4431 
4432 /*----------------------------------------------------------------------------
4433 | Returns the result of converting the single-precision floating-point value
4434 | `a' to the extended double-precision floating-point format.  The conversion
4435 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4436 | Arithmetic.
4437 *----------------------------------------------------------------------------*/
4438 
4439 floatx80 float32_to_floatx80(float32 a, float_status *status)
4440 {
4441     bool aSign;
4442     int aExp;
4443     uint32_t aSig;
4444 
4445     a = float32_squash_input_denormal(a, status);
4446     aSig = extractFloat32Frac( a );
4447     aExp = extractFloat32Exp( a );
4448     aSign = extractFloat32Sign( a );
4449     if ( aExp == 0xFF ) {
4450         if (aSig) {
4451             floatx80 res = commonNaNToFloatx80(float32ToCommonNaN(a, status),
4452                                                status);
4453             return floatx80_silence_nan(res, status);
4454         }
4455         return packFloatx80(aSign,
4456                             floatx80_infinity_high,
4457                             floatx80_infinity_low);
4458     }
4459     if ( aExp == 0 ) {
4460         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
4461         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4462     }
4463     aSig |= 0x00800000;
4464     return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
4465 
4466 }
4467 
4468 /*----------------------------------------------------------------------------
4469 | Returns the result of converting the single-precision floating-point value
4470 | `a' to the double-precision floating-point format.  The conversion is
4471 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4472 | Arithmetic.
4473 *----------------------------------------------------------------------------*/
4474 
4475 float128 float32_to_float128(float32 a, float_status *status)
4476 {
4477     bool aSign;
4478     int aExp;
4479     uint32_t aSig;
4480 
4481     a = float32_squash_input_denormal(a, status);
4482     aSig = extractFloat32Frac( a );
4483     aExp = extractFloat32Exp( a );
4484     aSign = extractFloat32Sign( a );
4485     if ( aExp == 0xFF ) {
4486         if (aSig) {
4487             return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
4488         }
4489         return packFloat128( aSign, 0x7FFF, 0, 0 );
4490     }
4491     if ( aExp == 0 ) {
4492         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
4493         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4494         --aExp;
4495     }
4496     return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
4497 
4498 }
4499 
4500 /*----------------------------------------------------------------------------
4501 | Returns the remainder of the single-precision floating-point value `a'
4502 | with respect to the corresponding value `b'.  The operation is performed
4503 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4504 *----------------------------------------------------------------------------*/
4505 
4506 float32 float32_rem(float32 a, float32 b, float_status *status)
4507 {
4508     bool aSign, zSign;
4509     int aExp, bExp, expDiff;
4510     uint32_t aSig, bSig;
4511     uint32_t q;
4512     uint64_t aSig64, bSig64, q64;
4513     uint32_t alternateASig;
4514     int32_t sigMean;
4515     a = float32_squash_input_denormal(a, status);
4516     b = float32_squash_input_denormal(b, status);
4517 
4518     aSig = extractFloat32Frac( a );
4519     aExp = extractFloat32Exp( a );
4520     aSign = extractFloat32Sign( a );
4521     bSig = extractFloat32Frac( b );
4522     bExp = extractFloat32Exp( b );
4523     if ( aExp == 0xFF ) {
4524         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
4525             return propagateFloat32NaN(a, b, status);
4526         }
4527         float_raise(float_flag_invalid, status);
4528         return float32_default_nan(status);
4529     }
4530     if ( bExp == 0xFF ) {
4531         if (bSig) {
4532             return propagateFloat32NaN(a, b, status);
4533         }
4534         return a;
4535     }
4536     if ( bExp == 0 ) {
4537         if ( bSig == 0 ) {
4538             float_raise(float_flag_invalid, status);
4539             return float32_default_nan(status);
4540         }
4541         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
4542     }
4543     if ( aExp == 0 ) {
4544         if ( aSig == 0 ) return a;
4545         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4546     }
4547     expDiff = aExp - bExp;
4548     aSig |= 0x00800000;
4549     bSig |= 0x00800000;
4550     if ( expDiff < 32 ) {
4551         aSig <<= 8;
4552         bSig <<= 8;
4553         if ( expDiff < 0 ) {
4554             if ( expDiff < -1 ) return a;
4555             aSig >>= 1;
4556         }
4557         q = ( bSig <= aSig );
4558         if ( q ) aSig -= bSig;
4559         if ( 0 < expDiff ) {
4560             q = ( ( (uint64_t) aSig )<<32 ) / bSig;
4561             q >>= 32 - expDiff;
4562             bSig >>= 2;
4563             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4564         }
4565         else {
4566             aSig >>= 2;
4567             bSig >>= 2;
4568         }
4569     }
4570     else {
4571         if ( bSig <= aSig ) aSig -= bSig;
4572         aSig64 = ( (uint64_t) aSig )<<40;
4573         bSig64 = ( (uint64_t) bSig )<<40;
4574         expDiff -= 64;
4575         while ( 0 < expDiff ) {
4576             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
4577             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
4578             aSig64 = - ( ( bSig * q64 )<<38 );
4579             expDiff -= 62;
4580         }
4581         expDiff += 64;
4582         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
4583         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
4584         q = q64>>( 64 - expDiff );
4585         bSig <<= 6;
4586         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
4587     }
4588     do {
4589         alternateASig = aSig;
4590         ++q;
4591         aSig -= bSig;
4592     } while ( 0 <= (int32_t) aSig );
4593     sigMean = aSig + alternateASig;
4594     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4595         aSig = alternateASig;
4596     }
4597     zSign = ( (int32_t) aSig < 0 );
4598     if ( zSign ) aSig = - aSig;
4599     return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
4600 }
4601 
4602 
4603 
4604 /*----------------------------------------------------------------------------
4605 | Returns the binary exponential of the single-precision floating-point value
4606 | `a'. The operation is performed according to the IEC/IEEE Standard for
4607 | Binary Floating-Point Arithmetic.
4608 |
4609 | Uses the following identities:
4610 |
4611 | 1. -------------------------------------------------------------------------
4612 |      x    x*ln(2)
4613 |     2  = e
4614 |
4615 | 2. -------------------------------------------------------------------------
4616 |                      2     3     4     5           n
4617 |      x        x     x     x     x     x           x
4618 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
4619 |               1!    2!    3!    4!    5!          n!
4620 *----------------------------------------------------------------------------*/
4621 
4622 static const float64 float32_exp2_coefficients[15] =
4623 {
4624     const_float64( 0x3ff0000000000000ll ), /*  1 */
4625     const_float64( 0x3fe0000000000000ll ), /*  2 */
4626     const_float64( 0x3fc5555555555555ll ), /*  3 */
4627     const_float64( 0x3fa5555555555555ll ), /*  4 */
4628     const_float64( 0x3f81111111111111ll ), /*  5 */
4629     const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
4630     const_float64( 0x3f2a01a01a01a01all ), /*  7 */
4631     const_float64( 0x3efa01a01a01a01all ), /*  8 */
4632     const_float64( 0x3ec71de3a556c734ll ), /*  9 */
4633     const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
4634     const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
4635     const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
4636     const_float64( 0x3de6124613a86d09ll ), /* 13 */
4637     const_float64( 0x3da93974a8c07c9dll ), /* 14 */
4638     const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
4639 };
4640 
4641 float32 float32_exp2(float32 a, float_status *status)
4642 {
4643     bool aSign;
4644     int aExp;
4645     uint32_t aSig;
4646     float64 r, x, xn;
4647     int i;
4648     a = float32_squash_input_denormal(a, status);
4649 
4650     aSig = extractFloat32Frac( a );
4651     aExp = extractFloat32Exp( a );
4652     aSign = extractFloat32Sign( a );
4653 
4654     if ( aExp == 0xFF) {
4655         if (aSig) {
4656             return propagateFloat32NaN(a, float32_zero, status);
4657         }
4658         return (aSign) ? float32_zero : a;
4659     }
4660     if (aExp == 0) {
4661         if (aSig == 0) return float32_one;
4662     }
4663 
4664     float_raise(float_flag_inexact, status);
4665 
4666     /* ******************************* */
4667     /* using float64 for approximation */
4668     /* ******************************* */
4669     x = float32_to_float64(a, status);
4670     x = float64_mul(x, float64_ln2, status);
4671 
4672     xn = x;
4673     r = float64_one;
4674     for (i = 0 ; i < 15 ; i++) {
4675         float64 f;
4676 
4677         f = float64_mul(xn, float32_exp2_coefficients[i], status);
4678         r = float64_add(r, f, status);
4679 
4680         xn = float64_mul(xn, x, status);
4681     }
4682 
4683     return float64_to_float32(r, status);
4684 }
4685 
4686 /*----------------------------------------------------------------------------
4687 | Returns the binary log of the single-precision floating-point value `a'.
4688 | The operation is performed according to the IEC/IEEE Standard for Binary
4689 | Floating-Point Arithmetic.
4690 *----------------------------------------------------------------------------*/
4691 float32 float32_log2(float32 a, float_status *status)
4692 {
4693     bool aSign, zSign;
4694     int aExp;
4695     uint32_t aSig, zSig, i;
4696 
4697     a = float32_squash_input_denormal(a, status);
4698     aSig = extractFloat32Frac( a );
4699     aExp = extractFloat32Exp( a );
4700     aSign = extractFloat32Sign( a );
4701 
4702     if ( aExp == 0 ) {
4703         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
4704         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4705     }
4706     if ( aSign ) {
4707         float_raise(float_flag_invalid, status);
4708         return float32_default_nan(status);
4709     }
4710     if ( aExp == 0xFF ) {
4711         if (aSig) {
4712             return propagateFloat32NaN(a, float32_zero, status);
4713         }
4714         return a;
4715     }
4716 
4717     aExp -= 0x7F;
4718     aSig |= 0x00800000;
4719     zSign = aExp < 0;
4720     zSig = aExp << 23;
4721 
4722     for (i = 1 << 22; i > 0; i >>= 1) {
4723         aSig = ( (uint64_t)aSig * aSig ) >> 23;
4724         if ( aSig & 0x01000000 ) {
4725             aSig >>= 1;
4726             zSig |= i;
4727         }
4728     }
4729 
4730     if ( zSign )
4731         zSig = -zSig;
4732 
4733     return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
4734 }
4735 
4736 /*----------------------------------------------------------------------------
4737 | Returns the result of converting the double-precision floating-point value
4738 | `a' to the extended double-precision floating-point format.  The conversion
4739 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4740 | Arithmetic.
4741 *----------------------------------------------------------------------------*/
4742 
4743 floatx80 float64_to_floatx80(float64 a, float_status *status)
4744 {
4745     bool aSign;
4746     int aExp;
4747     uint64_t aSig;
4748 
4749     a = float64_squash_input_denormal(a, status);
4750     aSig = extractFloat64Frac( a );
4751     aExp = extractFloat64Exp( a );
4752     aSign = extractFloat64Sign( a );
4753     if ( aExp == 0x7FF ) {
4754         if (aSig) {
4755             floatx80 res = commonNaNToFloatx80(float64ToCommonNaN(a, status),
4756                                                status);
4757             return floatx80_silence_nan(res, status);
4758         }
4759         return packFloatx80(aSign,
4760                             floatx80_infinity_high,
4761                             floatx80_infinity_low);
4762     }
4763     if ( aExp == 0 ) {
4764         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
4765         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4766     }
4767     return
4768         packFloatx80(
4769             aSign, aExp + 0x3C00, (aSig | UINT64_C(0x0010000000000000)) << 11);
4770 
4771 }
4772 
4773 /*----------------------------------------------------------------------------
4774 | Returns the result of converting the double-precision floating-point value
4775 | `a' to the quadruple-precision floating-point format.  The conversion is
4776 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4777 | Arithmetic.
4778 *----------------------------------------------------------------------------*/
4779 
4780 float128 float64_to_float128(float64 a, float_status *status)
4781 {
4782     bool aSign;
4783     int aExp;
4784     uint64_t aSig, zSig0, zSig1;
4785 
4786     a = float64_squash_input_denormal(a, status);
4787     aSig = extractFloat64Frac( a );
4788     aExp = extractFloat64Exp( a );
4789     aSign = extractFloat64Sign( a );
4790     if ( aExp == 0x7FF ) {
4791         if (aSig) {
4792             return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
4793         }
4794         return packFloat128( aSign, 0x7FFF, 0, 0 );
4795     }
4796     if ( aExp == 0 ) {
4797         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
4798         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4799         --aExp;
4800     }
4801     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
4802     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
4803 
4804 }
4805 
4806 
4807 /*----------------------------------------------------------------------------
4808 | Returns the remainder of the double-precision floating-point value `a'
4809 | with respect to the corresponding value `b'.  The operation is performed
4810 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4811 *----------------------------------------------------------------------------*/
4812 
4813 float64 float64_rem(float64 a, float64 b, float_status *status)
4814 {
4815     bool aSign, zSign;
4816     int aExp, bExp, expDiff;
4817     uint64_t aSig, bSig;
4818     uint64_t q, alternateASig;
4819     int64_t sigMean;
4820 
4821     a = float64_squash_input_denormal(a, status);
4822     b = float64_squash_input_denormal(b, status);
4823     aSig = extractFloat64Frac( a );
4824     aExp = extractFloat64Exp( a );
4825     aSign = extractFloat64Sign( a );
4826     bSig = extractFloat64Frac( b );
4827     bExp = extractFloat64Exp( b );
4828     if ( aExp == 0x7FF ) {
4829         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
4830             return propagateFloat64NaN(a, b, status);
4831         }
4832         float_raise(float_flag_invalid, status);
4833         return float64_default_nan(status);
4834     }
4835     if ( bExp == 0x7FF ) {
4836         if (bSig) {
4837             return propagateFloat64NaN(a, b, status);
4838         }
4839         return a;
4840     }
4841     if ( bExp == 0 ) {
4842         if ( bSig == 0 ) {
4843             float_raise(float_flag_invalid, status);
4844             return float64_default_nan(status);
4845         }
4846         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4847     }
4848     if ( aExp == 0 ) {
4849         if ( aSig == 0 ) return a;
4850         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4851     }
4852     expDiff = aExp - bExp;
4853     aSig = (aSig | UINT64_C(0x0010000000000000)) << 11;
4854     bSig = (bSig | UINT64_C(0x0010000000000000)) << 11;
4855     if ( expDiff < 0 ) {
4856         if ( expDiff < -1 ) return a;
4857         aSig >>= 1;
4858     }
4859     q = ( bSig <= aSig );
4860     if ( q ) aSig -= bSig;
4861     expDiff -= 64;
4862     while ( 0 < expDiff ) {
4863         q = estimateDiv128To64( aSig, 0, bSig );
4864         q = ( 2 < q ) ? q - 2 : 0;
4865         aSig = - ( ( bSig>>2 ) * q );
4866         expDiff -= 62;
4867     }
4868     expDiff += 64;
4869     if ( 0 < expDiff ) {
4870         q = estimateDiv128To64( aSig, 0, bSig );
4871         q = ( 2 < q ) ? q - 2 : 0;
4872         q >>= 64 - expDiff;
4873         bSig >>= 2;
4874         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4875     }
4876     else {
4877         aSig >>= 2;
4878         bSig >>= 2;
4879     }
4880     do {
4881         alternateASig = aSig;
4882         ++q;
4883         aSig -= bSig;
4884     } while ( 0 <= (int64_t) aSig );
4885     sigMean = aSig + alternateASig;
4886     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4887         aSig = alternateASig;
4888     }
4889     zSign = ( (int64_t) aSig < 0 );
4890     if ( zSign ) aSig = - aSig;
4891     return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
4892 
4893 }
4894 
4895 /*----------------------------------------------------------------------------
4896 | Returns the binary log of the double-precision floating-point value `a'.
4897 | The operation is performed according to the IEC/IEEE Standard for Binary
4898 | Floating-Point Arithmetic.
4899 *----------------------------------------------------------------------------*/
4900 float64 float64_log2(float64 a, float_status *status)
4901 {
4902     bool aSign, zSign;
4903     int aExp;
4904     uint64_t aSig, aSig0, aSig1, zSig, i;
4905     a = float64_squash_input_denormal(a, status);
4906 
4907     aSig = extractFloat64Frac( a );
4908     aExp = extractFloat64Exp( a );
4909     aSign = extractFloat64Sign( a );
4910 
4911     if ( aExp == 0 ) {
4912         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4913         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4914     }
4915     if ( aSign ) {
4916         float_raise(float_flag_invalid, status);
4917         return float64_default_nan(status);
4918     }
4919     if ( aExp == 0x7FF ) {
4920         if (aSig) {
4921             return propagateFloat64NaN(a, float64_zero, status);
4922         }
4923         return a;
4924     }
4925 
4926     aExp -= 0x3FF;
4927     aSig |= UINT64_C(0x0010000000000000);
4928     zSign = aExp < 0;
4929     zSig = (uint64_t)aExp << 52;
4930     for (i = 1LL << 51; i > 0; i >>= 1) {
4931         mul64To128( aSig, aSig, &aSig0, &aSig1 );
4932         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4933         if ( aSig & UINT64_C(0x0020000000000000) ) {
4934             aSig >>= 1;
4935             zSig |= i;
4936         }
4937     }
4938 
4939     if ( zSign )
4940         zSig = -zSig;
4941     return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
4942 }
4943 
4944 /*----------------------------------------------------------------------------
4945 | Returns the result of converting the extended double-precision floating-
4946 | point value `a' to the 32-bit two's complement integer format.  The
4947 | conversion is performed according to the IEC/IEEE Standard for Binary
4948 | Floating-Point Arithmetic---which means in particular that the conversion
4949 | is rounded according to the current rounding mode.  If `a' is a NaN, the
4950 | largest positive integer is returned.  Otherwise, if the conversion
4951 | overflows, the largest integer with the same sign as `a' is returned.
4952 *----------------------------------------------------------------------------*/
4953 
4954 int32_t floatx80_to_int32(floatx80 a, float_status *status)
4955 {
4956     bool aSign;
4957     int32_t aExp, shiftCount;
4958     uint64_t aSig;
4959 
4960     if (floatx80_invalid_encoding(a)) {
4961         float_raise(float_flag_invalid, status);
4962         return 1 << 31;
4963     }
4964     aSig = extractFloatx80Frac( a );
4965     aExp = extractFloatx80Exp( a );
4966     aSign = extractFloatx80Sign( a );
4967     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4968     shiftCount = 0x4037 - aExp;
4969     if ( shiftCount <= 0 ) shiftCount = 1;
4970     shift64RightJamming( aSig, shiftCount, &aSig );
4971     return roundAndPackInt32(aSign, aSig, status);
4972 
4973 }
4974 
4975 /*----------------------------------------------------------------------------
4976 | Returns the result of converting the extended double-precision floating-
4977 | point value `a' to the 32-bit two's complement integer format.  The
4978 | conversion is performed according to the IEC/IEEE Standard for Binary
4979 | Floating-Point Arithmetic, except that the conversion is always rounded
4980 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4981 | Otherwise, if the conversion overflows, the largest integer with the same
4982 | sign as `a' is returned.
4983 *----------------------------------------------------------------------------*/
4984 
4985 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
4986 {
4987     bool aSign;
4988     int32_t aExp, shiftCount;
4989     uint64_t aSig, savedASig;
4990     int32_t z;
4991 
4992     if (floatx80_invalid_encoding(a)) {
4993         float_raise(float_flag_invalid, status);
4994         return 1 << 31;
4995     }
4996     aSig = extractFloatx80Frac( a );
4997     aExp = extractFloatx80Exp( a );
4998     aSign = extractFloatx80Sign( a );
4999     if ( 0x401E < aExp ) {
5000         if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
5001         goto invalid;
5002     }
5003     else if ( aExp < 0x3FFF ) {
5004         if (aExp || aSig) {
5005             status->float_exception_flags |= float_flag_inexact;
5006         }
5007         return 0;
5008     }
5009     shiftCount = 0x403E - aExp;
5010     savedASig = aSig;
5011     aSig >>= shiftCount;
5012     z = aSig;
5013     if ( aSign ) z = - z;
5014     if ( ( z < 0 ) ^ aSign ) {
5015  invalid:
5016         float_raise(float_flag_invalid, status);
5017         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5018     }
5019     if ( ( aSig<<shiftCount ) != savedASig ) {
5020         status->float_exception_flags |= float_flag_inexact;
5021     }
5022     return z;
5023 
5024 }
5025 
5026 /*----------------------------------------------------------------------------
5027 | Returns the result of converting the extended double-precision floating-
5028 | point value `a' to the 64-bit two's complement integer format.  The
5029 | conversion is performed according to the IEC/IEEE Standard for Binary
5030 | Floating-Point Arithmetic---which means in particular that the conversion
5031 | is rounded according to the current rounding mode.  If `a' is a NaN,
5032 | the largest positive integer is returned.  Otherwise, if the conversion
5033 | overflows, the largest integer with the same sign as `a' is returned.
5034 *----------------------------------------------------------------------------*/
5035 
5036 int64_t floatx80_to_int64(floatx80 a, float_status *status)
5037 {
5038     bool aSign;
5039     int32_t aExp, shiftCount;
5040     uint64_t aSig, aSigExtra;
5041 
5042     if (floatx80_invalid_encoding(a)) {
5043         float_raise(float_flag_invalid, status);
5044         return 1ULL << 63;
5045     }
5046     aSig = extractFloatx80Frac( a );
5047     aExp = extractFloatx80Exp( a );
5048     aSign = extractFloatx80Sign( a );
5049     shiftCount = 0x403E - aExp;
5050     if ( shiftCount <= 0 ) {
5051         if ( shiftCount ) {
5052             float_raise(float_flag_invalid, status);
5053             if (!aSign || floatx80_is_any_nan(a)) {
5054                 return INT64_MAX;
5055             }
5056             return INT64_MIN;
5057         }
5058         aSigExtra = 0;
5059     }
5060     else {
5061         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
5062     }
5063     return roundAndPackInt64(aSign, aSig, aSigExtra, status);
5064 
5065 }
5066 
5067 /*----------------------------------------------------------------------------
5068 | Returns the result of converting the extended double-precision floating-
5069 | point value `a' to the 64-bit two's complement integer format.  The
5070 | conversion is performed according to the IEC/IEEE Standard for Binary
5071 | Floating-Point Arithmetic, except that the conversion is always rounded
5072 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
5073 | Otherwise, if the conversion overflows, the largest integer with the same
5074 | sign as `a' is returned.
5075 *----------------------------------------------------------------------------*/
5076 
5077 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
5078 {
5079     bool aSign;
5080     int32_t aExp, shiftCount;
5081     uint64_t aSig;
5082     int64_t z;
5083 
5084     if (floatx80_invalid_encoding(a)) {
5085         float_raise(float_flag_invalid, status);
5086         return 1ULL << 63;
5087     }
5088     aSig = extractFloatx80Frac( a );
5089     aExp = extractFloatx80Exp( a );
5090     aSign = extractFloatx80Sign( a );
5091     shiftCount = aExp - 0x403E;
5092     if ( 0 <= shiftCount ) {
5093         aSig &= UINT64_C(0x7FFFFFFFFFFFFFFF);
5094         if ( ( a.high != 0xC03E ) || aSig ) {
5095             float_raise(float_flag_invalid, status);
5096             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
5097                 return INT64_MAX;
5098             }
5099         }
5100         return INT64_MIN;
5101     }
5102     else if ( aExp < 0x3FFF ) {
5103         if (aExp | aSig) {
5104             status->float_exception_flags |= float_flag_inexact;
5105         }
5106         return 0;
5107     }
5108     z = aSig>>( - shiftCount );
5109     if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
5110         status->float_exception_flags |= float_flag_inexact;
5111     }
5112     if ( aSign ) z = - z;
5113     return z;
5114 
5115 }
5116 
5117 /*----------------------------------------------------------------------------
5118 | Returns the result of converting the extended double-precision floating-
5119 | point value `a' to the single-precision floating-point format.  The
5120 | conversion is performed according to the IEC/IEEE Standard for Binary
5121 | Floating-Point Arithmetic.
5122 *----------------------------------------------------------------------------*/
5123 
5124 float32 floatx80_to_float32(floatx80 a, float_status *status)
5125 {
5126     bool aSign;
5127     int32_t aExp;
5128     uint64_t aSig;
5129 
5130     if (floatx80_invalid_encoding(a)) {
5131         float_raise(float_flag_invalid, status);
5132         return float32_default_nan(status);
5133     }
5134     aSig = extractFloatx80Frac( a );
5135     aExp = extractFloatx80Exp( a );
5136     aSign = extractFloatx80Sign( a );
5137     if ( aExp == 0x7FFF ) {
5138         if ( (uint64_t) ( aSig<<1 ) ) {
5139             float32 res = commonNaNToFloat32(floatx80ToCommonNaN(a, status),
5140                                              status);
5141             return float32_silence_nan(res, status);
5142         }
5143         return packFloat32( aSign, 0xFF, 0 );
5144     }
5145     shift64RightJamming( aSig, 33, &aSig );
5146     if ( aExp || aSig ) aExp -= 0x3F81;
5147     return roundAndPackFloat32(aSign, aExp, aSig, status);
5148 
5149 }
5150 
5151 /*----------------------------------------------------------------------------
5152 | Returns the result of converting the extended double-precision floating-
5153 | point value `a' to the double-precision floating-point format.  The
5154 | conversion is performed according to the IEC/IEEE Standard for Binary
5155 | Floating-Point Arithmetic.
5156 *----------------------------------------------------------------------------*/
5157 
5158 float64 floatx80_to_float64(floatx80 a, float_status *status)
5159 {
5160     bool aSign;
5161     int32_t aExp;
5162     uint64_t aSig, zSig;
5163 
5164     if (floatx80_invalid_encoding(a)) {
5165         float_raise(float_flag_invalid, status);
5166         return float64_default_nan(status);
5167     }
5168     aSig = extractFloatx80Frac( a );
5169     aExp = extractFloatx80Exp( a );
5170     aSign = extractFloatx80Sign( a );
5171     if ( aExp == 0x7FFF ) {
5172         if ( (uint64_t) ( aSig<<1 ) ) {
5173             float64 res = commonNaNToFloat64(floatx80ToCommonNaN(a, status),
5174                                              status);
5175             return float64_silence_nan(res, status);
5176         }
5177         return packFloat64( aSign, 0x7FF, 0 );
5178     }
5179     shift64RightJamming( aSig, 1, &zSig );
5180     if ( aExp || aSig ) aExp -= 0x3C01;
5181     return roundAndPackFloat64(aSign, aExp, zSig, status);
5182 
5183 }
5184 
5185 /*----------------------------------------------------------------------------
5186 | Returns the result of converting the extended double-precision floating-
5187 | point value `a' to the quadruple-precision floating-point format.  The
5188 | conversion is performed according to the IEC/IEEE Standard for Binary
5189 | Floating-Point Arithmetic.
5190 *----------------------------------------------------------------------------*/
5191 
5192 float128 floatx80_to_float128(floatx80 a, float_status *status)
5193 {
5194     bool aSign;
5195     int aExp;
5196     uint64_t aSig, zSig0, zSig1;
5197 
5198     if (floatx80_invalid_encoding(a)) {
5199         float_raise(float_flag_invalid, status);
5200         return float128_default_nan(status);
5201     }
5202     aSig = extractFloatx80Frac( a );
5203     aExp = extractFloatx80Exp( a );
5204     aSign = extractFloatx80Sign( a );
5205     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
5206         float128 res = commonNaNToFloat128(floatx80ToCommonNaN(a, status),
5207                                            status);
5208         return float128_silence_nan(res, status);
5209     }
5210     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
5211     return packFloat128( aSign, aExp, zSig0, zSig1 );
5212 
5213 }
5214 
5215 /*----------------------------------------------------------------------------
5216 | Rounds the extended double-precision floating-point value `a'
5217 | to the precision provided by floatx80_rounding_precision and returns the
5218 | result as an extended double-precision floating-point value.
5219 | The operation is performed according to the IEC/IEEE Standard for Binary
5220 | Floating-Point Arithmetic.
5221 *----------------------------------------------------------------------------*/
5222 
5223 floatx80 floatx80_round(floatx80 a, float_status *status)
5224 {
5225     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5226                                 extractFloatx80Sign(a),
5227                                 extractFloatx80Exp(a),
5228                                 extractFloatx80Frac(a), 0, status);
5229 }
5230 
5231 /*----------------------------------------------------------------------------
5232 | Rounds the extended double-precision floating-point value `a' to an integer,
5233 | and returns the result as an extended quadruple-precision floating-point
5234 | value.  The operation is performed according to the IEC/IEEE Standard for
5235 | Binary Floating-Point Arithmetic.
5236 *----------------------------------------------------------------------------*/
5237 
5238 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
5239 {
5240     bool aSign;
5241     int32_t aExp;
5242     uint64_t lastBitMask, roundBitsMask;
5243     floatx80 z;
5244 
5245     if (floatx80_invalid_encoding(a)) {
5246         float_raise(float_flag_invalid, status);
5247         return floatx80_default_nan(status);
5248     }
5249     aExp = extractFloatx80Exp( a );
5250     if ( 0x403E <= aExp ) {
5251         if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
5252             return propagateFloatx80NaN(a, a, status);
5253         }
5254         return a;
5255     }
5256     if ( aExp < 0x3FFF ) {
5257         if (    ( aExp == 0 )
5258              && ( (uint64_t) ( extractFloatx80Frac( a ) ) == 0 ) ) {
5259             return a;
5260         }
5261         status->float_exception_flags |= float_flag_inexact;
5262         aSign = extractFloatx80Sign( a );
5263         switch (status->float_rounding_mode) {
5264          case float_round_nearest_even:
5265             if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
5266                ) {
5267                 return
5268                     packFloatx80( aSign, 0x3FFF, UINT64_C(0x8000000000000000));
5269             }
5270             break;
5271         case float_round_ties_away:
5272             if (aExp == 0x3FFE) {
5273                 return packFloatx80(aSign, 0x3FFF, UINT64_C(0x8000000000000000));
5274             }
5275             break;
5276          case float_round_down:
5277             return
5278                   aSign ?
5279                       packFloatx80( 1, 0x3FFF, UINT64_C(0x8000000000000000))
5280                 : packFloatx80( 0, 0, 0 );
5281          case float_round_up:
5282             return
5283                   aSign ? packFloatx80( 1, 0, 0 )
5284                 : packFloatx80( 0, 0x3FFF, UINT64_C(0x8000000000000000));
5285 
5286         case float_round_to_zero:
5287             break;
5288         default:
5289             g_assert_not_reached();
5290         }
5291         return packFloatx80( aSign, 0, 0 );
5292     }
5293     lastBitMask = 1;
5294     lastBitMask <<= 0x403E - aExp;
5295     roundBitsMask = lastBitMask - 1;
5296     z = a;
5297     switch (status->float_rounding_mode) {
5298     case float_round_nearest_even:
5299         z.low += lastBitMask>>1;
5300         if ((z.low & roundBitsMask) == 0) {
5301             z.low &= ~lastBitMask;
5302         }
5303         break;
5304     case float_round_ties_away:
5305         z.low += lastBitMask >> 1;
5306         break;
5307     case float_round_to_zero:
5308         break;
5309     case float_round_up:
5310         if (!extractFloatx80Sign(z)) {
5311             z.low += roundBitsMask;
5312         }
5313         break;
5314     case float_round_down:
5315         if (extractFloatx80Sign(z)) {
5316             z.low += roundBitsMask;
5317         }
5318         break;
5319     default:
5320         abort();
5321     }
5322     z.low &= ~ roundBitsMask;
5323     if ( z.low == 0 ) {
5324         ++z.high;
5325         z.low = UINT64_C(0x8000000000000000);
5326     }
5327     if (z.low != a.low) {
5328         status->float_exception_flags |= float_flag_inexact;
5329     }
5330     return z;
5331 
5332 }
5333 
5334 /*----------------------------------------------------------------------------
5335 | Returns the result of adding the absolute values of the extended double-
5336 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
5337 | negated before being returned.  `zSign' is ignored if the result is a NaN.
5338 | The addition is performed according to the IEC/IEEE Standard for Binary
5339 | Floating-Point Arithmetic.
5340 *----------------------------------------------------------------------------*/
5341 
5342 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, bool zSign,
5343                                 float_status *status)
5344 {
5345     int32_t aExp, bExp, zExp;
5346     uint64_t aSig, bSig, zSig0, zSig1;
5347     int32_t expDiff;
5348 
5349     aSig = extractFloatx80Frac( a );
5350     aExp = extractFloatx80Exp( a );
5351     bSig = extractFloatx80Frac( b );
5352     bExp = extractFloatx80Exp( b );
5353     expDiff = aExp - bExp;
5354     if ( 0 < expDiff ) {
5355         if ( aExp == 0x7FFF ) {
5356             if ((uint64_t)(aSig << 1)) {
5357                 return propagateFloatx80NaN(a, b, status);
5358             }
5359             return a;
5360         }
5361         if ( bExp == 0 ) --expDiff;
5362         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5363         zExp = aExp;
5364     }
5365     else if ( expDiff < 0 ) {
5366         if ( bExp == 0x7FFF ) {
5367             if ((uint64_t)(bSig << 1)) {
5368                 return propagateFloatx80NaN(a, b, status);
5369             }
5370             return packFloatx80(zSign,
5371                                 floatx80_infinity_high,
5372                                 floatx80_infinity_low);
5373         }
5374         if ( aExp == 0 ) ++expDiff;
5375         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5376         zExp = bExp;
5377     }
5378     else {
5379         if ( aExp == 0x7FFF ) {
5380             if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5381                 return propagateFloatx80NaN(a, b, status);
5382             }
5383             return a;
5384         }
5385         zSig1 = 0;
5386         zSig0 = aSig + bSig;
5387         if ( aExp == 0 ) {
5388             if ((aSig | bSig) & UINT64_C(0x8000000000000000) && zSig0 < aSig) {
5389                 /* At least one of the values is a pseudo-denormal,
5390                  * and there is a carry out of the result.  */
5391                 zExp = 1;
5392                 goto shiftRight1;
5393             }
5394             if (zSig0 == 0) {
5395                 return packFloatx80(zSign, 0, 0);
5396             }
5397             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
5398             goto roundAndPack;
5399         }
5400         zExp = aExp;
5401         goto shiftRight1;
5402     }
5403     zSig0 = aSig + bSig;
5404     if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
5405  shiftRight1:
5406     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
5407     zSig0 |= UINT64_C(0x8000000000000000);
5408     ++zExp;
5409  roundAndPack:
5410     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5411                                 zSign, zExp, zSig0, zSig1, status);
5412 }
5413 
5414 /*----------------------------------------------------------------------------
5415 | Returns the result of subtracting the absolute values of the extended
5416 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
5417 | difference is negated before being returned.  `zSign' is ignored if the
5418 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
5419 | Standard for Binary Floating-Point Arithmetic.
5420 *----------------------------------------------------------------------------*/
5421 
5422 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, bool zSign,
5423                                 float_status *status)
5424 {
5425     int32_t aExp, bExp, zExp;
5426     uint64_t aSig, bSig, zSig0, zSig1;
5427     int32_t expDiff;
5428 
5429     aSig = extractFloatx80Frac( a );
5430     aExp = extractFloatx80Exp( a );
5431     bSig = extractFloatx80Frac( b );
5432     bExp = extractFloatx80Exp( b );
5433     expDiff = aExp - bExp;
5434     if ( 0 < expDiff ) goto aExpBigger;
5435     if ( expDiff < 0 ) goto bExpBigger;
5436     if ( aExp == 0x7FFF ) {
5437         if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5438             return propagateFloatx80NaN(a, b, status);
5439         }
5440         float_raise(float_flag_invalid, status);
5441         return floatx80_default_nan(status);
5442     }
5443     if ( aExp == 0 ) {
5444         aExp = 1;
5445         bExp = 1;
5446     }
5447     zSig1 = 0;
5448     if ( bSig < aSig ) goto aBigger;
5449     if ( aSig < bSig ) goto bBigger;
5450     return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
5451  bExpBigger:
5452     if ( bExp == 0x7FFF ) {
5453         if ((uint64_t)(bSig << 1)) {
5454             return propagateFloatx80NaN(a, b, status);
5455         }
5456         return packFloatx80(zSign ^ 1, floatx80_infinity_high,
5457                             floatx80_infinity_low);
5458     }
5459     if ( aExp == 0 ) ++expDiff;
5460     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5461  bBigger:
5462     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
5463     zExp = bExp;
5464     zSign ^= 1;
5465     goto normalizeRoundAndPack;
5466  aExpBigger:
5467     if ( aExp == 0x7FFF ) {
5468         if ((uint64_t)(aSig << 1)) {
5469             return propagateFloatx80NaN(a, b, status);
5470         }
5471         return a;
5472     }
5473     if ( bExp == 0 ) --expDiff;
5474     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5475  aBigger:
5476     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
5477     zExp = aExp;
5478  normalizeRoundAndPack:
5479     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
5480                                          zSign, zExp, zSig0, zSig1, status);
5481 }
5482 
5483 /*----------------------------------------------------------------------------
5484 | Returns the result of adding the extended double-precision floating-point
5485 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
5486 | Standard for Binary Floating-Point Arithmetic.
5487 *----------------------------------------------------------------------------*/
5488 
5489 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
5490 {
5491     bool aSign, bSign;
5492 
5493     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5494         float_raise(float_flag_invalid, status);
5495         return floatx80_default_nan(status);
5496     }
5497     aSign = extractFloatx80Sign( a );
5498     bSign = extractFloatx80Sign( b );
5499     if ( aSign == bSign ) {
5500         return addFloatx80Sigs(a, b, aSign, status);
5501     }
5502     else {
5503         return subFloatx80Sigs(a, b, aSign, status);
5504     }
5505 
5506 }
5507 
5508 /*----------------------------------------------------------------------------
5509 | Returns the result of subtracting the extended double-precision floating-
5510 | point values `a' and `b'.  The operation is performed according to the
5511 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5512 *----------------------------------------------------------------------------*/
5513 
5514 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
5515 {
5516     bool aSign, bSign;
5517 
5518     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5519         float_raise(float_flag_invalid, status);
5520         return floatx80_default_nan(status);
5521     }
5522     aSign = extractFloatx80Sign( a );
5523     bSign = extractFloatx80Sign( b );
5524     if ( aSign == bSign ) {
5525         return subFloatx80Sigs(a, b, aSign, status);
5526     }
5527     else {
5528         return addFloatx80Sigs(a, b, aSign, status);
5529     }
5530 
5531 }
5532 
5533 /*----------------------------------------------------------------------------
5534 | Returns the result of multiplying the extended double-precision floating-
5535 | point values `a' and `b'.  The operation is performed according to the
5536 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5537 *----------------------------------------------------------------------------*/
5538 
5539 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
5540 {
5541     bool aSign, bSign, zSign;
5542     int32_t aExp, bExp, zExp;
5543     uint64_t aSig, bSig, zSig0, zSig1;
5544 
5545     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5546         float_raise(float_flag_invalid, status);
5547         return floatx80_default_nan(status);
5548     }
5549     aSig = extractFloatx80Frac( a );
5550     aExp = extractFloatx80Exp( a );
5551     aSign = extractFloatx80Sign( a );
5552     bSig = extractFloatx80Frac( b );
5553     bExp = extractFloatx80Exp( b );
5554     bSign = extractFloatx80Sign( b );
5555     zSign = aSign ^ bSign;
5556     if ( aExp == 0x7FFF ) {
5557         if (    (uint64_t) ( aSig<<1 )
5558              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5559             return propagateFloatx80NaN(a, b, status);
5560         }
5561         if ( ( bExp | bSig ) == 0 ) goto invalid;
5562         return packFloatx80(zSign, floatx80_infinity_high,
5563                                    floatx80_infinity_low);
5564     }
5565     if ( bExp == 0x7FFF ) {
5566         if ((uint64_t)(bSig << 1)) {
5567             return propagateFloatx80NaN(a, b, status);
5568         }
5569         if ( ( aExp | aSig ) == 0 ) {
5570  invalid:
5571             float_raise(float_flag_invalid, status);
5572             return floatx80_default_nan(status);
5573         }
5574         return packFloatx80(zSign, floatx80_infinity_high,
5575                                    floatx80_infinity_low);
5576     }
5577     if ( aExp == 0 ) {
5578         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5579         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5580     }
5581     if ( bExp == 0 ) {
5582         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5583         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5584     }
5585     zExp = aExp + bExp - 0x3FFE;
5586     mul64To128( aSig, bSig, &zSig0, &zSig1 );
5587     if ( 0 < (int64_t) zSig0 ) {
5588         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5589         --zExp;
5590     }
5591     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5592                                 zSign, zExp, zSig0, zSig1, status);
5593 }
5594 
5595 /*----------------------------------------------------------------------------
5596 | Returns the result of dividing the extended double-precision floating-point
5597 | value `a' by the corresponding value `b'.  The operation is performed
5598 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5599 *----------------------------------------------------------------------------*/
5600 
5601 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
5602 {
5603     bool aSign, bSign, zSign;
5604     int32_t aExp, bExp, zExp;
5605     uint64_t aSig, bSig, zSig0, zSig1;
5606     uint64_t rem0, rem1, rem2, term0, term1, term2;
5607 
5608     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5609         float_raise(float_flag_invalid, status);
5610         return floatx80_default_nan(status);
5611     }
5612     aSig = extractFloatx80Frac( a );
5613     aExp = extractFloatx80Exp( a );
5614     aSign = extractFloatx80Sign( a );
5615     bSig = extractFloatx80Frac( b );
5616     bExp = extractFloatx80Exp( b );
5617     bSign = extractFloatx80Sign( b );
5618     zSign = aSign ^ bSign;
5619     if ( aExp == 0x7FFF ) {
5620         if ((uint64_t)(aSig << 1)) {
5621             return propagateFloatx80NaN(a, b, status);
5622         }
5623         if ( bExp == 0x7FFF ) {
5624             if ((uint64_t)(bSig << 1)) {
5625                 return propagateFloatx80NaN(a, b, status);
5626             }
5627             goto invalid;
5628         }
5629         return packFloatx80(zSign, floatx80_infinity_high,
5630                                    floatx80_infinity_low);
5631     }
5632     if ( bExp == 0x7FFF ) {
5633         if ((uint64_t)(bSig << 1)) {
5634             return propagateFloatx80NaN(a, b, status);
5635         }
5636         return packFloatx80( zSign, 0, 0 );
5637     }
5638     if ( bExp == 0 ) {
5639         if ( bSig == 0 ) {
5640             if ( ( aExp | aSig ) == 0 ) {
5641  invalid:
5642                 float_raise(float_flag_invalid, status);
5643                 return floatx80_default_nan(status);
5644             }
5645             float_raise(float_flag_divbyzero, status);
5646             return packFloatx80(zSign, floatx80_infinity_high,
5647                                        floatx80_infinity_low);
5648         }
5649         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5650     }
5651     if ( aExp == 0 ) {
5652         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5653         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5654     }
5655     zExp = aExp - bExp + 0x3FFE;
5656     rem1 = 0;
5657     if ( bSig <= aSig ) {
5658         shift128Right( aSig, 0, 1, &aSig, &rem1 );
5659         ++zExp;
5660     }
5661     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5662     mul64To128( bSig, zSig0, &term0, &term1 );
5663     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
5664     while ( (int64_t) rem0 < 0 ) {
5665         --zSig0;
5666         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5667     }
5668     zSig1 = estimateDiv128To64( rem1, 0, bSig );
5669     if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
5670         mul64To128( bSig, zSig1, &term1, &term2 );
5671         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5672         while ( (int64_t) rem1 < 0 ) {
5673             --zSig1;
5674             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5675         }
5676         zSig1 |= ( ( rem1 | rem2 ) != 0 );
5677     }
5678     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5679                                 zSign, zExp, zSig0, zSig1, status);
5680 }
5681 
5682 /*----------------------------------------------------------------------------
5683 | Returns the remainder of the extended double-precision floating-point value
5684 | `a' with respect to the corresponding value `b'.  The operation is performed
5685 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5686 *----------------------------------------------------------------------------*/
5687 
5688 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
5689 {
5690     bool aSign, zSign;
5691     int32_t aExp, bExp, expDiff;
5692     uint64_t aSig0, aSig1, bSig;
5693     uint64_t q, term0, term1, alternateASig0, alternateASig1;
5694 
5695     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5696         float_raise(float_flag_invalid, status);
5697         return floatx80_default_nan(status);
5698     }
5699     aSig0 = extractFloatx80Frac( a );
5700     aExp = extractFloatx80Exp( a );
5701     aSign = extractFloatx80Sign( a );
5702     bSig = extractFloatx80Frac( b );
5703     bExp = extractFloatx80Exp( b );
5704     if ( aExp == 0x7FFF ) {
5705         if (    (uint64_t) ( aSig0<<1 )
5706              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5707             return propagateFloatx80NaN(a, b, status);
5708         }
5709         goto invalid;
5710     }
5711     if ( bExp == 0x7FFF ) {
5712         if ((uint64_t)(bSig << 1)) {
5713             return propagateFloatx80NaN(a, b, status);
5714         }
5715         return a;
5716     }
5717     if ( bExp == 0 ) {
5718         if ( bSig == 0 ) {
5719  invalid:
5720             float_raise(float_flag_invalid, status);
5721             return floatx80_default_nan(status);
5722         }
5723         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5724     }
5725     if ( aExp == 0 ) {
5726         if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
5727         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5728     }
5729     bSig |= UINT64_C(0x8000000000000000);
5730     zSign = aSign;
5731     expDiff = aExp - bExp;
5732     aSig1 = 0;
5733     if ( expDiff < 0 ) {
5734         if ( expDiff < -1 ) return a;
5735         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5736         expDiff = 0;
5737     }
5738     q = ( bSig <= aSig0 );
5739     if ( q ) aSig0 -= bSig;
5740     expDiff -= 64;
5741     while ( 0 < expDiff ) {
5742         q = estimateDiv128To64( aSig0, aSig1, bSig );
5743         q = ( 2 < q ) ? q - 2 : 0;
5744         mul64To128( bSig, q, &term0, &term1 );
5745         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5746         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5747         expDiff -= 62;
5748     }
5749     expDiff += 64;
5750     if ( 0 < expDiff ) {
5751         q = estimateDiv128To64( aSig0, aSig1, bSig );
5752         q = ( 2 < q ) ? q - 2 : 0;
5753         q >>= 64 - expDiff;
5754         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5755         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5756         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5757         while ( le128( term0, term1, aSig0, aSig1 ) ) {
5758             ++q;
5759             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5760         }
5761     }
5762     else {
5763         term1 = 0;
5764         term0 = bSig;
5765     }
5766     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5767     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5768          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5769               && ( q & 1 ) )
5770        ) {
5771         aSig0 = alternateASig0;
5772         aSig1 = alternateASig1;
5773         zSign = ! zSign;
5774     }
5775     return
5776         normalizeRoundAndPackFloatx80(
5777             80, zSign, bExp + expDiff, aSig0, aSig1, status);
5778 
5779 }
5780 
5781 /*----------------------------------------------------------------------------
5782 | Returns the square root of the extended double-precision floating-point
5783 | value `a'.  The operation is performed according to the IEC/IEEE Standard
5784 | for Binary Floating-Point Arithmetic.
5785 *----------------------------------------------------------------------------*/
5786 
5787 floatx80 floatx80_sqrt(floatx80 a, float_status *status)
5788 {
5789     bool aSign;
5790     int32_t aExp, zExp;
5791     uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5792     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5793 
5794     if (floatx80_invalid_encoding(a)) {
5795         float_raise(float_flag_invalid, status);
5796         return floatx80_default_nan(status);
5797     }
5798     aSig0 = extractFloatx80Frac( a );
5799     aExp = extractFloatx80Exp( a );
5800     aSign = extractFloatx80Sign( a );
5801     if ( aExp == 0x7FFF ) {
5802         if ((uint64_t)(aSig0 << 1)) {
5803             return propagateFloatx80NaN(a, a, status);
5804         }
5805         if ( ! aSign ) return a;
5806         goto invalid;
5807     }
5808     if ( aSign ) {
5809         if ( ( aExp | aSig0 ) == 0 ) return a;
5810  invalid:
5811         float_raise(float_flag_invalid, status);
5812         return floatx80_default_nan(status);
5813     }
5814     if ( aExp == 0 ) {
5815         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5816         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5817     }
5818     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5819     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5820     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5821     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5822     doubleZSig0 = zSig0<<1;
5823     mul64To128( zSig0, zSig0, &term0, &term1 );
5824     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5825     while ( (int64_t) rem0 < 0 ) {
5826         --zSig0;
5827         doubleZSig0 -= 2;
5828         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5829     }
5830     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5831     if ( ( zSig1 & UINT64_C(0x3FFFFFFFFFFFFFFF) ) <= 5 ) {
5832         if ( zSig1 == 0 ) zSig1 = 1;
5833         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5834         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5835         mul64To128( zSig1, zSig1, &term2, &term3 );
5836         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5837         while ( (int64_t) rem1 < 0 ) {
5838             --zSig1;
5839             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5840             term3 |= 1;
5841             term2 |= doubleZSig0;
5842             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5843         }
5844         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5845     }
5846     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5847     zSig0 |= doubleZSig0;
5848     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5849                                 0, zExp, zSig0, zSig1, status);
5850 }
5851 
5852 /*----------------------------------------------------------------------------
5853 | Returns the result of converting the quadruple-precision floating-point
5854 | value `a' to the 32-bit two's complement integer format.  The conversion
5855 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5856 | Arithmetic---which means in particular that the conversion is rounded
5857 | according to the current rounding mode.  If `a' is a NaN, the largest
5858 | positive integer is returned.  Otherwise, if the conversion overflows, the
5859 | largest integer with the same sign as `a' is returned.
5860 *----------------------------------------------------------------------------*/
5861 
5862 int32_t float128_to_int32(float128 a, float_status *status)
5863 {
5864     bool aSign;
5865     int32_t aExp, shiftCount;
5866     uint64_t aSig0, aSig1;
5867 
5868     aSig1 = extractFloat128Frac1( a );
5869     aSig0 = extractFloat128Frac0( a );
5870     aExp = extractFloat128Exp( a );
5871     aSign = extractFloat128Sign( a );
5872     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5873     if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
5874     aSig0 |= ( aSig1 != 0 );
5875     shiftCount = 0x4028 - aExp;
5876     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5877     return roundAndPackInt32(aSign, aSig0, status);
5878 
5879 }
5880 
5881 /*----------------------------------------------------------------------------
5882 | Returns the result of converting the quadruple-precision floating-point
5883 | value `a' to the 32-bit two's complement integer format.  The conversion
5884 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5885 | Arithmetic, except that the conversion is always rounded toward zero.  If
5886 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
5887 | conversion overflows, the largest integer with the same sign as `a' is
5888 | returned.
5889 *----------------------------------------------------------------------------*/
5890 
5891 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
5892 {
5893     bool aSign;
5894     int32_t aExp, shiftCount;
5895     uint64_t aSig0, aSig1, savedASig;
5896     int32_t z;
5897 
5898     aSig1 = extractFloat128Frac1( a );
5899     aSig0 = extractFloat128Frac0( a );
5900     aExp = extractFloat128Exp( a );
5901     aSign = extractFloat128Sign( a );
5902     aSig0 |= ( aSig1 != 0 );
5903     if ( 0x401E < aExp ) {
5904         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5905         goto invalid;
5906     }
5907     else if ( aExp < 0x3FFF ) {
5908         if (aExp || aSig0) {
5909             status->float_exception_flags |= float_flag_inexact;
5910         }
5911         return 0;
5912     }
5913     aSig0 |= UINT64_C(0x0001000000000000);
5914     shiftCount = 0x402F - aExp;
5915     savedASig = aSig0;
5916     aSig0 >>= shiftCount;
5917     z = aSig0;
5918     if ( aSign ) z = - z;
5919     if ( ( z < 0 ) ^ aSign ) {
5920  invalid:
5921         float_raise(float_flag_invalid, status);
5922         return aSign ? INT32_MIN : INT32_MAX;
5923     }
5924     if ( ( aSig0<<shiftCount ) != savedASig ) {
5925         status->float_exception_flags |= float_flag_inexact;
5926     }
5927     return z;
5928 
5929 }
5930 
5931 /*----------------------------------------------------------------------------
5932 | Returns the result of converting the quadruple-precision floating-point
5933 | value `a' to the 64-bit two's complement integer format.  The conversion
5934 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5935 | Arithmetic---which means in particular that the conversion is rounded
5936 | according to the current rounding mode.  If `a' is a NaN, the largest
5937 | positive integer is returned.  Otherwise, if the conversion overflows, the
5938 | largest integer with the same sign as `a' is returned.
5939 *----------------------------------------------------------------------------*/
5940 
5941 int64_t float128_to_int64(float128 a, float_status *status)
5942 {
5943     bool aSign;
5944     int32_t aExp, shiftCount;
5945     uint64_t aSig0, aSig1;
5946 
5947     aSig1 = extractFloat128Frac1( a );
5948     aSig0 = extractFloat128Frac0( a );
5949     aExp = extractFloat128Exp( a );
5950     aSign = extractFloat128Sign( a );
5951     if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
5952     shiftCount = 0x402F - aExp;
5953     if ( shiftCount <= 0 ) {
5954         if ( 0x403E < aExp ) {
5955             float_raise(float_flag_invalid, status);
5956             if (    ! aSign
5957                  || (    ( aExp == 0x7FFF )
5958                       && ( aSig1 || ( aSig0 != UINT64_C(0x0001000000000000) ) )
5959                     )
5960                ) {
5961                 return INT64_MAX;
5962             }
5963             return INT64_MIN;
5964         }
5965         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5966     }
5967     else {
5968         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5969     }
5970     return roundAndPackInt64(aSign, aSig0, aSig1, status);
5971 
5972 }
5973 
5974 /*----------------------------------------------------------------------------
5975 | Returns the result of converting the quadruple-precision floating-point
5976 | value `a' to the 64-bit two's complement integer format.  The conversion
5977 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5978 | Arithmetic, except that the conversion is always rounded toward zero.
5979 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
5980 | the conversion overflows, the largest integer with the same sign as `a' is
5981 | returned.
5982 *----------------------------------------------------------------------------*/
5983 
5984 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
5985 {
5986     bool aSign;
5987     int32_t aExp, shiftCount;
5988     uint64_t aSig0, aSig1;
5989     int64_t z;
5990 
5991     aSig1 = extractFloat128Frac1( a );
5992     aSig0 = extractFloat128Frac0( a );
5993     aExp = extractFloat128Exp( a );
5994     aSign = extractFloat128Sign( a );
5995     if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
5996     shiftCount = aExp - 0x402F;
5997     if ( 0 < shiftCount ) {
5998         if ( 0x403E <= aExp ) {
5999             aSig0 &= UINT64_C(0x0000FFFFFFFFFFFF);
6000             if (    ( a.high == UINT64_C(0xC03E000000000000) )
6001                  && ( aSig1 < UINT64_C(0x0002000000000000) ) ) {
6002                 if (aSig1) {
6003                     status->float_exception_flags |= float_flag_inexact;
6004                 }
6005             }
6006             else {
6007                 float_raise(float_flag_invalid, status);
6008                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
6009                     return INT64_MAX;
6010                 }
6011             }
6012             return INT64_MIN;
6013         }
6014         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
6015         if ( (uint64_t) ( aSig1<<shiftCount ) ) {
6016             status->float_exception_flags |= float_flag_inexact;
6017         }
6018     }
6019     else {
6020         if ( aExp < 0x3FFF ) {
6021             if ( aExp | aSig0 | aSig1 ) {
6022                 status->float_exception_flags |= float_flag_inexact;
6023             }
6024             return 0;
6025         }
6026         z = aSig0>>( - shiftCount );
6027         if (    aSig1
6028              || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
6029             status->float_exception_flags |= float_flag_inexact;
6030         }
6031     }
6032     if ( aSign ) z = - z;
6033     return z;
6034 
6035 }
6036 
6037 /*----------------------------------------------------------------------------
6038 | Returns the result of converting the quadruple-precision floating-point value
6039 | `a' to the 64-bit unsigned integer format.  The conversion is
6040 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6041 | Arithmetic---which means in particular that the conversion is rounded
6042 | according to the current rounding mode.  If `a' is a NaN, the largest
6043 | positive integer is returned.  If the conversion overflows, the
6044 | largest unsigned integer is returned.  If 'a' is negative, the value is
6045 | rounded and zero is returned; negative values that do not round to zero
6046 | will raise the inexact exception.
6047 *----------------------------------------------------------------------------*/
6048 
6049 uint64_t float128_to_uint64(float128 a, float_status *status)
6050 {
6051     bool aSign;
6052     int aExp;
6053     int shiftCount;
6054     uint64_t aSig0, aSig1;
6055 
6056     aSig0 = extractFloat128Frac0(a);
6057     aSig1 = extractFloat128Frac1(a);
6058     aExp = extractFloat128Exp(a);
6059     aSign = extractFloat128Sign(a);
6060     if (aSign && (aExp > 0x3FFE)) {
6061         float_raise(float_flag_invalid, status);
6062         if (float128_is_any_nan(a)) {
6063             return UINT64_MAX;
6064         } else {
6065             return 0;
6066         }
6067     }
6068     if (aExp) {
6069         aSig0 |= UINT64_C(0x0001000000000000);
6070     }
6071     shiftCount = 0x402F - aExp;
6072     if (shiftCount <= 0) {
6073         if (0x403E < aExp) {
6074             float_raise(float_flag_invalid, status);
6075             return UINT64_MAX;
6076         }
6077         shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
6078     } else {
6079         shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
6080     }
6081     return roundAndPackUint64(aSign, aSig0, aSig1, status);
6082 }
6083 
6084 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
6085 {
6086     uint64_t v;
6087     signed char current_rounding_mode = status->float_rounding_mode;
6088 
6089     set_float_rounding_mode(float_round_to_zero, status);
6090     v = float128_to_uint64(a, status);
6091     set_float_rounding_mode(current_rounding_mode, status);
6092 
6093     return v;
6094 }
6095 
6096 /*----------------------------------------------------------------------------
6097 | Returns the result of converting the quadruple-precision floating-point
6098 | value `a' to the 32-bit unsigned integer format.  The conversion
6099 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6100 | Arithmetic except that the conversion is always rounded toward zero.
6101 | If `a' is a NaN, the largest positive integer is returned.  Otherwise,
6102 | if the conversion overflows, the largest unsigned integer is returned.
6103 | If 'a' is negative, the value is rounded and zero is returned; negative
6104 | values that do not round to zero will raise the inexact exception.
6105 *----------------------------------------------------------------------------*/
6106 
6107 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
6108 {
6109     uint64_t v;
6110     uint32_t res;
6111     int old_exc_flags = get_float_exception_flags(status);
6112 
6113     v = float128_to_uint64_round_to_zero(a, status);
6114     if (v > 0xffffffff) {
6115         res = 0xffffffff;
6116     } else {
6117         return v;
6118     }
6119     set_float_exception_flags(old_exc_flags, status);
6120     float_raise(float_flag_invalid, status);
6121     return res;
6122 }
6123 
6124 /*----------------------------------------------------------------------------
6125 | Returns the result of converting the quadruple-precision floating-point value
6126 | `a' to the 32-bit unsigned integer format.  The conversion is
6127 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6128 | Arithmetic---which means in particular that the conversion is rounded
6129 | according to the current rounding mode.  If `a' is a NaN, the largest
6130 | positive integer is returned.  If the conversion overflows, the
6131 | largest unsigned integer is returned.  If 'a' is negative, the value is
6132 | rounded and zero is returned; negative values that do not round to zero
6133 | will raise the inexact exception.
6134 *----------------------------------------------------------------------------*/
6135 
6136 uint32_t float128_to_uint32(float128 a, float_status *status)
6137 {
6138     uint64_t v;
6139     uint32_t res;
6140     int old_exc_flags = get_float_exception_flags(status);
6141 
6142     v = float128_to_uint64(a, status);
6143     if (v > 0xffffffff) {
6144         res = 0xffffffff;
6145     } else {
6146         return v;
6147     }
6148     set_float_exception_flags(old_exc_flags, status);
6149     float_raise(float_flag_invalid, status);
6150     return res;
6151 }
6152 
6153 /*----------------------------------------------------------------------------
6154 | Returns the result of converting the quadruple-precision floating-point
6155 | value `a' to the single-precision floating-point format.  The conversion
6156 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6157 | Arithmetic.
6158 *----------------------------------------------------------------------------*/
6159 
6160 float32 float128_to_float32(float128 a, float_status *status)
6161 {
6162     bool aSign;
6163     int32_t aExp;
6164     uint64_t aSig0, aSig1;
6165     uint32_t zSig;
6166 
6167     aSig1 = extractFloat128Frac1( a );
6168     aSig0 = extractFloat128Frac0( a );
6169     aExp = extractFloat128Exp( a );
6170     aSign = extractFloat128Sign( a );
6171     if ( aExp == 0x7FFF ) {
6172         if ( aSig0 | aSig1 ) {
6173             return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
6174         }
6175         return packFloat32( aSign, 0xFF, 0 );
6176     }
6177     aSig0 |= ( aSig1 != 0 );
6178     shift64RightJamming( aSig0, 18, &aSig0 );
6179     zSig = aSig0;
6180     if ( aExp || zSig ) {
6181         zSig |= 0x40000000;
6182         aExp -= 0x3F81;
6183     }
6184     return roundAndPackFloat32(aSign, aExp, zSig, status);
6185 
6186 }
6187 
6188 /*----------------------------------------------------------------------------
6189 | Returns the result of converting the quadruple-precision floating-point
6190 | value `a' to the double-precision floating-point format.  The conversion
6191 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6192 | Arithmetic.
6193 *----------------------------------------------------------------------------*/
6194 
6195 float64 float128_to_float64(float128 a, float_status *status)
6196 {
6197     bool aSign;
6198     int32_t aExp;
6199     uint64_t aSig0, aSig1;
6200 
6201     aSig1 = extractFloat128Frac1( a );
6202     aSig0 = extractFloat128Frac0( a );
6203     aExp = extractFloat128Exp( a );
6204     aSign = extractFloat128Sign( a );
6205     if ( aExp == 0x7FFF ) {
6206         if ( aSig0 | aSig1 ) {
6207             return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
6208         }
6209         return packFloat64( aSign, 0x7FF, 0 );
6210     }
6211     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6212     aSig0 |= ( aSig1 != 0 );
6213     if ( aExp || aSig0 ) {
6214         aSig0 |= UINT64_C(0x4000000000000000);
6215         aExp -= 0x3C01;
6216     }
6217     return roundAndPackFloat64(aSign, aExp, aSig0, status);
6218 
6219 }
6220 
6221 /*----------------------------------------------------------------------------
6222 | Returns the result of converting the quadruple-precision floating-point
6223 | value `a' to the extended double-precision floating-point format.  The
6224 | conversion is performed according to the IEC/IEEE Standard for Binary
6225 | Floating-Point Arithmetic.
6226 *----------------------------------------------------------------------------*/
6227 
6228 floatx80 float128_to_floatx80(float128 a, float_status *status)
6229 {
6230     bool aSign;
6231     int32_t aExp;
6232     uint64_t aSig0, aSig1;
6233 
6234     aSig1 = extractFloat128Frac1( a );
6235     aSig0 = extractFloat128Frac0( a );
6236     aExp = extractFloat128Exp( a );
6237     aSign = extractFloat128Sign( a );
6238     if ( aExp == 0x7FFF ) {
6239         if ( aSig0 | aSig1 ) {
6240             floatx80 res = commonNaNToFloatx80(float128ToCommonNaN(a, status),
6241                                                status);
6242             return floatx80_silence_nan(res, status);
6243         }
6244         return packFloatx80(aSign, floatx80_infinity_high,
6245                                    floatx80_infinity_low);
6246     }
6247     if ( aExp == 0 ) {
6248         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
6249         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6250     }
6251     else {
6252         aSig0 |= UINT64_C(0x0001000000000000);
6253     }
6254     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
6255     return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
6256 
6257 }
6258 
6259 /*----------------------------------------------------------------------------
6260 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6261 | returns the result as a quadruple-precision floating-point value.  The
6262 | operation is performed according to the IEC/IEEE Standard for Binary
6263 | Floating-Point Arithmetic.
6264 *----------------------------------------------------------------------------*/
6265 
6266 float128 float128_round_to_int(float128 a, float_status *status)
6267 {
6268     bool aSign;
6269     int32_t aExp;
6270     uint64_t lastBitMask, roundBitsMask;
6271     float128 z;
6272 
6273     aExp = extractFloat128Exp( a );
6274     if ( 0x402F <= aExp ) {
6275         if ( 0x406F <= aExp ) {
6276             if (    ( aExp == 0x7FFF )
6277                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
6278                ) {
6279                 return propagateFloat128NaN(a, a, status);
6280             }
6281             return a;
6282         }
6283         lastBitMask = 1;
6284         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
6285         roundBitsMask = lastBitMask - 1;
6286         z = a;
6287         switch (status->float_rounding_mode) {
6288         case float_round_nearest_even:
6289             if ( lastBitMask ) {
6290                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
6291                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
6292             }
6293             else {
6294                 if ( (int64_t) z.low < 0 ) {
6295                     ++z.high;
6296                     if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
6297                 }
6298             }
6299             break;
6300         case float_round_ties_away:
6301             if (lastBitMask) {
6302                 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
6303             } else {
6304                 if ((int64_t) z.low < 0) {
6305                     ++z.high;
6306                 }
6307             }
6308             break;
6309         case float_round_to_zero:
6310             break;
6311         case float_round_up:
6312             if (!extractFloat128Sign(z)) {
6313                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6314             }
6315             break;
6316         case float_round_down:
6317             if (extractFloat128Sign(z)) {
6318                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6319             }
6320             break;
6321         case float_round_to_odd:
6322             /*
6323              * Note that if lastBitMask == 0, the last bit is the lsb
6324              * of high, and roundBitsMask == -1.
6325              */
6326             if ((lastBitMask ? z.low & lastBitMask : z.high & 1) == 0) {
6327                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6328             }
6329             break;
6330         default:
6331             abort();
6332         }
6333         z.low &= ~ roundBitsMask;
6334     }
6335     else {
6336         if ( aExp < 0x3FFF ) {
6337             if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
6338             status->float_exception_flags |= float_flag_inexact;
6339             aSign = extractFloat128Sign( a );
6340             switch (status->float_rounding_mode) {
6341             case float_round_nearest_even:
6342                 if (    ( aExp == 0x3FFE )
6343                      && (   extractFloat128Frac0( a )
6344                           | extractFloat128Frac1( a ) )
6345                    ) {
6346                     return packFloat128( aSign, 0x3FFF, 0, 0 );
6347                 }
6348                 break;
6349             case float_round_ties_away:
6350                 if (aExp == 0x3FFE) {
6351                     return packFloat128(aSign, 0x3FFF, 0, 0);
6352                 }
6353                 break;
6354             case float_round_down:
6355                 return
6356                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
6357                     : packFloat128( 0, 0, 0, 0 );
6358             case float_round_up:
6359                 return
6360                       aSign ? packFloat128( 1, 0, 0, 0 )
6361                     : packFloat128( 0, 0x3FFF, 0, 0 );
6362 
6363             case float_round_to_odd:
6364                 return packFloat128(aSign, 0x3FFF, 0, 0);
6365 
6366             case float_round_to_zero:
6367                 break;
6368             }
6369             return packFloat128( aSign, 0, 0, 0 );
6370         }
6371         lastBitMask = 1;
6372         lastBitMask <<= 0x402F - aExp;
6373         roundBitsMask = lastBitMask - 1;
6374         z.low = 0;
6375         z.high = a.high;
6376         switch (status->float_rounding_mode) {
6377         case float_round_nearest_even:
6378             z.high += lastBitMask>>1;
6379             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
6380                 z.high &= ~ lastBitMask;
6381             }
6382             break;
6383         case float_round_ties_away:
6384             z.high += lastBitMask>>1;
6385             break;
6386         case float_round_to_zero:
6387             break;
6388         case float_round_up:
6389             if (!extractFloat128Sign(z)) {
6390                 z.high |= ( a.low != 0 );
6391                 z.high += roundBitsMask;
6392             }
6393             break;
6394         case float_round_down:
6395             if (extractFloat128Sign(z)) {
6396                 z.high |= (a.low != 0);
6397                 z.high += roundBitsMask;
6398             }
6399             break;
6400         case float_round_to_odd:
6401             if ((z.high & lastBitMask) == 0) {
6402                 z.high |= (a.low != 0);
6403                 z.high += roundBitsMask;
6404             }
6405             break;
6406         default:
6407             abort();
6408         }
6409         z.high &= ~ roundBitsMask;
6410     }
6411     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
6412         status->float_exception_flags |= float_flag_inexact;
6413     }
6414     return z;
6415 
6416 }
6417 
6418 /*----------------------------------------------------------------------------
6419 | Returns the result of adding the absolute values of the quadruple-precision
6420 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
6421 | before being returned.  `zSign' is ignored if the result is a NaN.
6422 | The addition is performed according to the IEC/IEEE Standard for Binary
6423 | Floating-Point Arithmetic.
6424 *----------------------------------------------------------------------------*/
6425 
6426 static float128 addFloat128Sigs(float128 a, float128 b, bool zSign,
6427                                 float_status *status)
6428 {
6429     int32_t aExp, bExp, zExp;
6430     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6431     int32_t expDiff;
6432 
6433     aSig1 = extractFloat128Frac1( a );
6434     aSig0 = extractFloat128Frac0( a );
6435     aExp = extractFloat128Exp( a );
6436     bSig1 = extractFloat128Frac1( b );
6437     bSig0 = extractFloat128Frac0( b );
6438     bExp = extractFloat128Exp( b );
6439     expDiff = aExp - bExp;
6440     if ( 0 < expDiff ) {
6441         if ( aExp == 0x7FFF ) {
6442             if (aSig0 | aSig1) {
6443                 return propagateFloat128NaN(a, b, status);
6444             }
6445             return a;
6446         }
6447         if ( bExp == 0 ) {
6448             --expDiff;
6449         }
6450         else {
6451             bSig0 |= UINT64_C(0x0001000000000000);
6452         }
6453         shift128ExtraRightJamming(
6454             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
6455         zExp = aExp;
6456     }
6457     else if ( expDiff < 0 ) {
6458         if ( bExp == 0x7FFF ) {
6459             if (bSig0 | bSig1) {
6460                 return propagateFloat128NaN(a, b, status);
6461             }
6462             return packFloat128( zSign, 0x7FFF, 0, 0 );
6463         }
6464         if ( aExp == 0 ) {
6465             ++expDiff;
6466         }
6467         else {
6468             aSig0 |= UINT64_C(0x0001000000000000);
6469         }
6470         shift128ExtraRightJamming(
6471             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
6472         zExp = bExp;
6473     }
6474     else {
6475         if ( aExp == 0x7FFF ) {
6476             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6477                 return propagateFloat128NaN(a, b, status);
6478             }
6479             return a;
6480         }
6481         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6482         if ( aExp == 0 ) {
6483             if (status->flush_to_zero) {
6484                 if (zSig0 | zSig1) {
6485                     float_raise(float_flag_output_denormal, status);
6486                 }
6487                 return packFloat128(zSign, 0, 0, 0);
6488             }
6489             return packFloat128( zSign, 0, zSig0, zSig1 );
6490         }
6491         zSig2 = 0;
6492         zSig0 |= UINT64_C(0x0002000000000000);
6493         zExp = aExp;
6494         goto shiftRight1;
6495     }
6496     aSig0 |= UINT64_C(0x0001000000000000);
6497     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6498     --zExp;
6499     if ( zSig0 < UINT64_C(0x0002000000000000) ) goto roundAndPack;
6500     ++zExp;
6501  shiftRight1:
6502     shift128ExtraRightJamming(
6503         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6504  roundAndPack:
6505     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6506 
6507 }
6508 
6509 /*----------------------------------------------------------------------------
6510 | Returns the result of subtracting the absolute values of the quadruple-
6511 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
6512 | difference is negated before being returned.  `zSign' is ignored if the
6513 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
6514 | Standard for Binary Floating-Point Arithmetic.
6515 *----------------------------------------------------------------------------*/
6516 
6517 static float128 subFloat128Sigs(float128 a, float128 b, bool zSign,
6518                                 float_status *status)
6519 {
6520     int32_t aExp, bExp, zExp;
6521     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
6522     int32_t expDiff;
6523 
6524     aSig1 = extractFloat128Frac1( a );
6525     aSig0 = extractFloat128Frac0( a );
6526     aExp = extractFloat128Exp( a );
6527     bSig1 = extractFloat128Frac1( b );
6528     bSig0 = extractFloat128Frac0( b );
6529     bExp = extractFloat128Exp( b );
6530     expDiff = aExp - bExp;
6531     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6532     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6533     if ( 0 < expDiff ) goto aExpBigger;
6534     if ( expDiff < 0 ) goto bExpBigger;
6535     if ( aExp == 0x7FFF ) {
6536         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6537             return propagateFloat128NaN(a, b, status);
6538         }
6539         float_raise(float_flag_invalid, status);
6540         return float128_default_nan(status);
6541     }
6542     if ( aExp == 0 ) {
6543         aExp = 1;
6544         bExp = 1;
6545     }
6546     if ( bSig0 < aSig0 ) goto aBigger;
6547     if ( aSig0 < bSig0 ) goto bBigger;
6548     if ( bSig1 < aSig1 ) goto aBigger;
6549     if ( aSig1 < bSig1 ) goto bBigger;
6550     return packFloat128(status->float_rounding_mode == float_round_down,
6551                         0, 0, 0);
6552  bExpBigger:
6553     if ( bExp == 0x7FFF ) {
6554         if (bSig0 | bSig1) {
6555             return propagateFloat128NaN(a, b, status);
6556         }
6557         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6558     }
6559     if ( aExp == 0 ) {
6560         ++expDiff;
6561     }
6562     else {
6563         aSig0 |= UINT64_C(0x4000000000000000);
6564     }
6565     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6566     bSig0 |= UINT64_C(0x4000000000000000);
6567  bBigger:
6568     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6569     zExp = bExp;
6570     zSign ^= 1;
6571     goto normalizeRoundAndPack;
6572  aExpBigger:
6573     if ( aExp == 0x7FFF ) {
6574         if (aSig0 | aSig1) {
6575             return propagateFloat128NaN(a, b, status);
6576         }
6577         return a;
6578     }
6579     if ( bExp == 0 ) {
6580         --expDiff;
6581     }
6582     else {
6583         bSig0 |= UINT64_C(0x4000000000000000);
6584     }
6585     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6586     aSig0 |= UINT64_C(0x4000000000000000);
6587  aBigger:
6588     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6589     zExp = aExp;
6590  normalizeRoundAndPack:
6591     --zExp;
6592     return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
6593                                          status);
6594 
6595 }
6596 
6597 /*----------------------------------------------------------------------------
6598 | Returns the result of adding the quadruple-precision floating-point values
6599 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
6600 | for Binary Floating-Point Arithmetic.
6601 *----------------------------------------------------------------------------*/
6602 
6603 float128 float128_add(float128 a, float128 b, float_status *status)
6604 {
6605     bool aSign, bSign;
6606 
6607     aSign = extractFloat128Sign( a );
6608     bSign = extractFloat128Sign( b );
6609     if ( aSign == bSign ) {
6610         return addFloat128Sigs(a, b, aSign, status);
6611     }
6612     else {
6613         return subFloat128Sigs(a, b, aSign, status);
6614     }
6615 
6616 }
6617 
6618 /*----------------------------------------------------------------------------
6619 | Returns the result of subtracting the quadruple-precision floating-point
6620 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6621 | Standard for Binary Floating-Point Arithmetic.
6622 *----------------------------------------------------------------------------*/
6623 
6624 float128 float128_sub(float128 a, float128 b, float_status *status)
6625 {
6626     bool aSign, bSign;
6627 
6628     aSign = extractFloat128Sign( a );
6629     bSign = extractFloat128Sign( b );
6630     if ( aSign == bSign ) {
6631         return subFloat128Sigs(a, b, aSign, status);
6632     }
6633     else {
6634         return addFloat128Sigs(a, b, aSign, status);
6635     }
6636 
6637 }
6638 
6639 /*----------------------------------------------------------------------------
6640 | Returns the result of multiplying the quadruple-precision floating-point
6641 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6642 | Standard for Binary Floating-Point Arithmetic.
6643 *----------------------------------------------------------------------------*/
6644 
6645 float128 float128_mul(float128 a, float128 b, float_status *status)
6646 {
6647     bool aSign, bSign, zSign;
6648     int32_t aExp, bExp, zExp;
6649     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
6650 
6651     aSig1 = extractFloat128Frac1( a );
6652     aSig0 = extractFloat128Frac0( a );
6653     aExp = extractFloat128Exp( a );
6654     aSign = extractFloat128Sign( a );
6655     bSig1 = extractFloat128Frac1( b );
6656     bSig0 = extractFloat128Frac0( b );
6657     bExp = extractFloat128Exp( b );
6658     bSign = extractFloat128Sign( b );
6659     zSign = aSign ^ bSign;
6660     if ( aExp == 0x7FFF ) {
6661         if (    ( aSig0 | aSig1 )
6662              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6663             return propagateFloat128NaN(a, b, status);
6664         }
6665         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6666         return packFloat128( zSign, 0x7FFF, 0, 0 );
6667     }
6668     if ( bExp == 0x7FFF ) {
6669         if (bSig0 | bSig1) {
6670             return propagateFloat128NaN(a, b, status);
6671         }
6672         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6673  invalid:
6674             float_raise(float_flag_invalid, status);
6675             return float128_default_nan(status);
6676         }
6677         return packFloat128( zSign, 0x7FFF, 0, 0 );
6678     }
6679     if ( aExp == 0 ) {
6680         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6681         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6682     }
6683     if ( bExp == 0 ) {
6684         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6685         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6686     }
6687     zExp = aExp + bExp - 0x4000;
6688     aSig0 |= UINT64_C(0x0001000000000000);
6689     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6690     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6691     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6692     zSig2 |= ( zSig3 != 0 );
6693     if (UINT64_C( 0x0002000000000000) <= zSig0 ) {
6694         shift128ExtraRightJamming(
6695             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6696         ++zExp;
6697     }
6698     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6699 
6700 }
6701 
6702 /*----------------------------------------------------------------------------
6703 | Returns the result of dividing the quadruple-precision floating-point value
6704 | `a' by the corresponding value `b'.  The operation is performed according to
6705 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6706 *----------------------------------------------------------------------------*/
6707 
6708 float128 float128_div(float128 a, float128 b, float_status *status)
6709 {
6710     bool aSign, bSign, zSign;
6711     int32_t aExp, bExp, zExp;
6712     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6713     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6714 
6715     aSig1 = extractFloat128Frac1( a );
6716     aSig0 = extractFloat128Frac0( a );
6717     aExp = extractFloat128Exp( a );
6718     aSign = extractFloat128Sign( a );
6719     bSig1 = extractFloat128Frac1( b );
6720     bSig0 = extractFloat128Frac0( b );
6721     bExp = extractFloat128Exp( b );
6722     bSign = extractFloat128Sign( b );
6723     zSign = aSign ^ bSign;
6724     if ( aExp == 0x7FFF ) {
6725         if (aSig0 | aSig1) {
6726             return propagateFloat128NaN(a, b, status);
6727         }
6728         if ( bExp == 0x7FFF ) {
6729             if (bSig0 | bSig1) {
6730                 return propagateFloat128NaN(a, b, status);
6731             }
6732             goto invalid;
6733         }
6734         return packFloat128( zSign, 0x7FFF, 0, 0 );
6735     }
6736     if ( bExp == 0x7FFF ) {
6737         if (bSig0 | bSig1) {
6738             return propagateFloat128NaN(a, b, status);
6739         }
6740         return packFloat128( zSign, 0, 0, 0 );
6741     }
6742     if ( bExp == 0 ) {
6743         if ( ( bSig0 | bSig1 ) == 0 ) {
6744             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6745  invalid:
6746                 float_raise(float_flag_invalid, status);
6747                 return float128_default_nan(status);
6748             }
6749             float_raise(float_flag_divbyzero, status);
6750             return packFloat128( zSign, 0x7FFF, 0, 0 );
6751         }
6752         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6753     }
6754     if ( aExp == 0 ) {
6755         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6756         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6757     }
6758     zExp = aExp - bExp + 0x3FFD;
6759     shortShift128Left(
6760         aSig0 | UINT64_C(0x0001000000000000), aSig1, 15, &aSig0, &aSig1 );
6761     shortShift128Left(
6762         bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 );
6763     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6764         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6765         ++zExp;
6766     }
6767     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6768     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6769     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
6770     while ( (int64_t) rem0 < 0 ) {
6771         --zSig0;
6772         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6773     }
6774     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6775     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6776         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6777         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
6778         while ( (int64_t) rem1 < 0 ) {
6779             --zSig1;
6780             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6781         }
6782         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6783     }
6784     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
6785     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6786 
6787 }
6788 
6789 /*----------------------------------------------------------------------------
6790 | Returns the remainder of the quadruple-precision floating-point value `a'
6791 | with respect to the corresponding value `b'.  The operation is performed
6792 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6793 *----------------------------------------------------------------------------*/
6794 
6795 float128 float128_rem(float128 a, float128 b, float_status *status)
6796 {
6797     bool aSign, zSign;
6798     int32_t aExp, bExp, expDiff;
6799     uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6800     uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6801     int64_t sigMean0;
6802 
6803     aSig1 = extractFloat128Frac1( a );
6804     aSig0 = extractFloat128Frac0( a );
6805     aExp = extractFloat128Exp( a );
6806     aSign = extractFloat128Sign( a );
6807     bSig1 = extractFloat128Frac1( b );
6808     bSig0 = extractFloat128Frac0( b );
6809     bExp = extractFloat128Exp( b );
6810     if ( aExp == 0x7FFF ) {
6811         if (    ( aSig0 | aSig1 )
6812              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6813             return propagateFloat128NaN(a, b, status);
6814         }
6815         goto invalid;
6816     }
6817     if ( bExp == 0x7FFF ) {
6818         if (bSig0 | bSig1) {
6819             return propagateFloat128NaN(a, b, status);
6820         }
6821         return a;
6822     }
6823     if ( bExp == 0 ) {
6824         if ( ( bSig0 | bSig1 ) == 0 ) {
6825  invalid:
6826             float_raise(float_flag_invalid, status);
6827             return float128_default_nan(status);
6828         }
6829         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6830     }
6831     if ( aExp == 0 ) {
6832         if ( ( aSig0 | aSig1 ) == 0 ) return a;
6833         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6834     }
6835     expDiff = aExp - bExp;
6836     if ( expDiff < -1 ) return a;
6837     shortShift128Left(
6838         aSig0 | UINT64_C(0x0001000000000000),
6839         aSig1,
6840         15 - ( expDiff < 0 ),
6841         &aSig0,
6842         &aSig1
6843     );
6844     shortShift128Left(
6845         bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 );
6846     q = le128( bSig0, bSig1, aSig0, aSig1 );
6847     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6848     expDiff -= 64;
6849     while ( 0 < expDiff ) {
6850         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6851         q = ( 4 < q ) ? q - 4 : 0;
6852         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6853         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6854         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6855         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6856         expDiff -= 61;
6857     }
6858     if ( -64 < expDiff ) {
6859         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6860         q = ( 4 < q ) ? q - 4 : 0;
6861         q >>= - expDiff;
6862         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6863         expDiff += 52;
6864         if ( expDiff < 0 ) {
6865             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6866         }
6867         else {
6868             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6869         }
6870         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6871         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6872     }
6873     else {
6874         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6875         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6876     }
6877     do {
6878         alternateASig0 = aSig0;
6879         alternateASig1 = aSig1;
6880         ++q;
6881         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6882     } while ( 0 <= (int64_t) aSig0 );
6883     add128(
6884         aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
6885     if (    ( sigMean0 < 0 )
6886          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6887         aSig0 = alternateASig0;
6888         aSig1 = alternateASig1;
6889     }
6890     zSign = ( (int64_t) aSig0 < 0 );
6891     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6892     return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
6893                                          status);
6894 }
6895 
6896 /*----------------------------------------------------------------------------
6897 | Returns the square root of the quadruple-precision floating-point value `a'.
6898 | The operation is performed according to the IEC/IEEE Standard for Binary
6899 | Floating-Point Arithmetic.
6900 *----------------------------------------------------------------------------*/
6901 
6902 float128 float128_sqrt(float128 a, float_status *status)
6903 {
6904     bool aSign;
6905     int32_t aExp, zExp;
6906     uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6907     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6908 
6909     aSig1 = extractFloat128Frac1( a );
6910     aSig0 = extractFloat128Frac0( a );
6911     aExp = extractFloat128Exp( a );
6912     aSign = extractFloat128Sign( a );
6913     if ( aExp == 0x7FFF ) {
6914         if (aSig0 | aSig1) {
6915             return propagateFloat128NaN(a, a, status);
6916         }
6917         if ( ! aSign ) return a;
6918         goto invalid;
6919     }
6920     if ( aSign ) {
6921         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6922  invalid:
6923         float_raise(float_flag_invalid, status);
6924         return float128_default_nan(status);
6925     }
6926     if ( aExp == 0 ) {
6927         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6928         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6929     }
6930     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6931     aSig0 |= UINT64_C(0x0001000000000000);
6932     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6933     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6934     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6935     doubleZSig0 = zSig0<<1;
6936     mul64To128( zSig0, zSig0, &term0, &term1 );
6937     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6938     while ( (int64_t) rem0 < 0 ) {
6939         --zSig0;
6940         doubleZSig0 -= 2;
6941         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6942     }
6943     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6944     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6945         if ( zSig1 == 0 ) zSig1 = 1;
6946         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6947         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6948         mul64To128( zSig1, zSig1, &term2, &term3 );
6949         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6950         while ( (int64_t) rem1 < 0 ) {
6951             --zSig1;
6952             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6953             term3 |= 1;
6954             term2 |= doubleZSig0;
6955             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6956         }
6957         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6958     }
6959     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6960     return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
6961 
6962 }
6963 
6964 static inline FloatRelation
6965 floatx80_compare_internal(floatx80 a, floatx80 b, bool is_quiet,
6966                           float_status *status)
6967 {
6968     bool aSign, bSign;
6969 
6970     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6971         float_raise(float_flag_invalid, status);
6972         return float_relation_unordered;
6973     }
6974     if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
6975           ( extractFloatx80Frac( a )<<1 ) ) ||
6976         ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
6977           ( extractFloatx80Frac( b )<<1 ) )) {
6978         if (!is_quiet ||
6979             floatx80_is_signaling_nan(a, status) ||
6980             floatx80_is_signaling_nan(b, status)) {
6981             float_raise(float_flag_invalid, status);
6982         }
6983         return float_relation_unordered;
6984     }
6985     aSign = extractFloatx80Sign( a );
6986     bSign = extractFloatx80Sign( b );
6987     if ( aSign != bSign ) {
6988 
6989         if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
6990              ( ( a.low | b.low ) == 0 ) ) {
6991             /* zero case */
6992             return float_relation_equal;
6993         } else {
6994             return 1 - (2 * aSign);
6995         }
6996     } else {
6997         /* Normalize pseudo-denormals before comparison.  */
6998         if ((a.high & 0x7fff) == 0 && a.low & UINT64_C(0x8000000000000000)) {
6999             ++a.high;
7000         }
7001         if ((b.high & 0x7fff) == 0 && b.low & UINT64_C(0x8000000000000000)) {
7002             ++b.high;
7003         }
7004         if (a.low == b.low && a.high == b.high) {
7005             return float_relation_equal;
7006         } else {
7007             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7008         }
7009     }
7010 }
7011 
7012 FloatRelation floatx80_compare(floatx80 a, floatx80 b, float_status *status)
7013 {
7014     return floatx80_compare_internal(a, b, 0, status);
7015 }
7016 
7017 FloatRelation floatx80_compare_quiet(floatx80 a, floatx80 b,
7018                                      float_status *status)
7019 {
7020     return floatx80_compare_internal(a, b, 1, status);
7021 }
7022 
7023 static inline FloatRelation
7024 float128_compare_internal(float128 a, float128 b, bool is_quiet,
7025                           float_status *status)
7026 {
7027     bool aSign, bSign;
7028 
7029     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7030           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7031         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7032           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7033         if (!is_quiet ||
7034             float128_is_signaling_nan(a, status) ||
7035             float128_is_signaling_nan(b, status)) {
7036             float_raise(float_flag_invalid, status);
7037         }
7038         return float_relation_unordered;
7039     }
7040     aSign = extractFloat128Sign( a );
7041     bSign = extractFloat128Sign( b );
7042     if ( aSign != bSign ) {
7043         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7044             /* zero case */
7045             return float_relation_equal;
7046         } else {
7047             return 1 - (2 * aSign);
7048         }
7049     } else {
7050         if (a.low == b.low && a.high == b.high) {
7051             return float_relation_equal;
7052         } else {
7053             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7054         }
7055     }
7056 }
7057 
7058 FloatRelation float128_compare(float128 a, float128 b, float_status *status)
7059 {
7060     return float128_compare_internal(a, b, 0, status);
7061 }
7062 
7063 FloatRelation float128_compare_quiet(float128 a, float128 b,
7064                                      float_status *status)
7065 {
7066     return float128_compare_internal(a, b, 1, status);
7067 }
7068 
7069 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
7070 {
7071     bool aSign;
7072     int32_t aExp;
7073     uint64_t aSig;
7074 
7075     if (floatx80_invalid_encoding(a)) {
7076         float_raise(float_flag_invalid, status);
7077         return floatx80_default_nan(status);
7078     }
7079     aSig = extractFloatx80Frac( a );
7080     aExp = extractFloatx80Exp( a );
7081     aSign = extractFloatx80Sign( a );
7082 
7083     if ( aExp == 0x7FFF ) {
7084         if ( aSig<<1 ) {
7085             return propagateFloatx80NaN(a, a, status);
7086         }
7087         return a;
7088     }
7089 
7090     if (aExp == 0) {
7091         if (aSig == 0) {
7092             return a;
7093         }
7094         aExp++;
7095     }
7096 
7097     if (n > 0x10000) {
7098         n = 0x10000;
7099     } else if (n < -0x10000) {
7100         n = -0x10000;
7101     }
7102 
7103     aExp += n;
7104     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7105                                          aSign, aExp, aSig, 0, status);
7106 }
7107 
7108 float128 float128_scalbn(float128 a, int n, float_status *status)
7109 {
7110     bool aSign;
7111     int32_t aExp;
7112     uint64_t aSig0, aSig1;
7113 
7114     aSig1 = extractFloat128Frac1( a );
7115     aSig0 = extractFloat128Frac0( a );
7116     aExp = extractFloat128Exp( a );
7117     aSign = extractFloat128Sign( a );
7118     if ( aExp == 0x7FFF ) {
7119         if ( aSig0 | aSig1 ) {
7120             return propagateFloat128NaN(a, a, status);
7121         }
7122         return a;
7123     }
7124     if (aExp != 0) {
7125         aSig0 |= UINT64_C(0x0001000000000000);
7126     } else if (aSig0 == 0 && aSig1 == 0) {
7127         return a;
7128     } else {
7129         aExp++;
7130     }
7131 
7132     if (n > 0x10000) {
7133         n = 0x10000;
7134     } else if (n < -0x10000) {
7135         n = -0x10000;
7136     }
7137 
7138     aExp += n - 1;
7139     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7140                                          , status);
7141 
7142 }
7143 
7144 static void __attribute__((constructor)) softfloat_init(void)
7145 {
7146     union_float64 ua, ub, uc, ur;
7147 
7148     if (QEMU_NO_HARDFLOAT) {
7149         return;
7150     }
7151     /*
7152      * Test that the host's FMA is not obviously broken. For example,
7153      * glibc < 2.23 can perform an incorrect FMA on certain hosts; see
7154      *   https://sourceware.org/bugzilla/show_bug.cgi?id=13304
7155      */
7156     ua.s = 0x0020000000000001ULL;
7157     ub.s = 0x3ca0000000000000ULL;
7158     uc.s = 0x0020000000000000ULL;
7159     ur.h = fma(ua.h, ub.h, uc.h);
7160     if (ur.s != 0x0020000000000001ULL) {
7161         force_soft_fma = true;
7162     }
7163 }
7164