xref: /openbmc/qemu/fpu/softfloat.c (revision 93dd625f)
1 /*
2  * QEMU float support
3  *
4  * The code in this source file is derived from release 2a of the SoftFloat
5  * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6  * some later contributions) are provided under that license, as detailed below.
7  * It has subsequently been modified by contributors to the QEMU Project,
8  * so some portions are provided under:
9  *  the SoftFloat-2a license
10  *  the BSD license
11  *  GPL-v2-or-later
12  *
13  * Any future contributions to this file after December 1st 2014 will be
14  * taken to be licensed under the Softfloat-2a license unless specifically
15  * indicated otherwise.
16  */
17 
18 /*
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
22 
23 Written by John R. Hauser.  This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704.  Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980.  The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
32 
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38 
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
43 
44 ===============================================================================
45 */
46 
47 /* BSD licensing:
48  * Copyright (c) 2006, Fabrice Bellard
49  * All rights reserved.
50  *
51  * Redistribution and use in source and binary forms, with or without
52  * modification, are permitted provided that the following conditions are met:
53  *
54  * 1. Redistributions of source code must retain the above copyright notice,
55  * this list of conditions and the following disclaimer.
56  *
57  * 2. Redistributions in binary form must reproduce the above copyright notice,
58  * this list of conditions and the following disclaimer in the documentation
59  * and/or other materials provided with the distribution.
60  *
61  * 3. Neither the name of the copyright holder nor the names of its contributors
62  * may be used to endorse or promote products derived from this software without
63  * specific prior written permission.
64  *
65  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75  * THE POSSIBILITY OF SUCH DAMAGE.
76  */
77 
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79  * version 2 or later. See the COPYING file in the top-level directory.
80  */
81 
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83  * target-dependent and needs the TARGET_* macros.
84  */
85 #include "qemu/osdep.h"
86 #include <math.h>
87 #include "qemu/bitops.h"
88 #include "fpu/softfloat.h"
89 
90 /* We only need stdlib for abort() */
91 
92 /*----------------------------------------------------------------------------
93 | Primitive arithmetic functions, including multi-word arithmetic, and
94 | division and square root approximations.  (Can be specialized to target if
95 | desired.)
96 *----------------------------------------------------------------------------*/
97 #include "fpu/softfloat-macros.h"
98 
99 /*
100  * Hardfloat
101  *
102  * Fast emulation of guest FP instructions is challenging for two reasons.
103  * First, FP instruction semantics are similar but not identical, particularly
104  * when handling NaNs. Second, emulating at reasonable speed the guest FP
105  * exception flags is not trivial: reading the host's flags register with a
106  * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp],
107  * and trapping on every FP exception is not fast nor pleasant to work with.
108  *
109  * We address these challenges by leveraging the host FPU for a subset of the
110  * operations. To do this we expand on the idea presented in this paper:
111  *
112  * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a
113  * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615.
114  *
115  * The idea is thus to leverage the host FPU to (1) compute FP operations
116  * and (2) identify whether FP exceptions occurred while avoiding
117  * expensive exception flag register accesses.
118  *
119  * An important optimization shown in the paper is that given that exception
120  * flags are rarely cleared by the guest, we can avoid recomputing some flags.
121  * This is particularly useful for the inexact flag, which is very frequently
122  * raised in floating-point workloads.
123  *
124  * We optimize the code further by deferring to soft-fp whenever FP exception
125  * detection might get hairy. Two examples: (1) when at least one operand is
126  * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result
127  * and the result is < the minimum normal.
128  */
129 #define GEN_INPUT_FLUSH__NOCHECK(name, soft_t)                          \
130     static inline void name(soft_t *a, float_status *s)                 \
131     {                                                                   \
132         if (unlikely(soft_t ## _is_denormal(*a))) {                     \
133             *a = soft_t ## _set_sign(soft_t ## _zero,                   \
134                                      soft_t ## _is_neg(*a));            \
135             s->float_exception_flags |= float_flag_input_denormal;      \
136         }                                                               \
137     }
138 
139 GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck, float32)
140 GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck, float64)
141 #undef GEN_INPUT_FLUSH__NOCHECK
142 
143 #define GEN_INPUT_FLUSH1(name, soft_t)                  \
144     static inline void name(soft_t *a, float_status *s) \
145     {                                                   \
146         if (likely(!s->flush_inputs_to_zero)) {         \
147             return;                                     \
148         }                                               \
149         soft_t ## _input_flush__nocheck(a, s);          \
150     }
151 
152 GEN_INPUT_FLUSH1(float32_input_flush1, float32)
153 GEN_INPUT_FLUSH1(float64_input_flush1, float64)
154 #undef GEN_INPUT_FLUSH1
155 
156 #define GEN_INPUT_FLUSH2(name, soft_t)                                  \
157     static inline void name(soft_t *a, soft_t *b, float_status *s)      \
158     {                                                                   \
159         if (likely(!s->flush_inputs_to_zero)) {                         \
160             return;                                                     \
161         }                                                               \
162         soft_t ## _input_flush__nocheck(a, s);                          \
163         soft_t ## _input_flush__nocheck(b, s);                          \
164     }
165 
166 GEN_INPUT_FLUSH2(float32_input_flush2, float32)
167 GEN_INPUT_FLUSH2(float64_input_flush2, float64)
168 #undef GEN_INPUT_FLUSH2
169 
170 #define GEN_INPUT_FLUSH3(name, soft_t)                                  \
171     static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \
172     {                                                                   \
173         if (likely(!s->flush_inputs_to_zero)) {                         \
174             return;                                                     \
175         }                                                               \
176         soft_t ## _input_flush__nocheck(a, s);                          \
177         soft_t ## _input_flush__nocheck(b, s);                          \
178         soft_t ## _input_flush__nocheck(c, s);                          \
179     }
180 
181 GEN_INPUT_FLUSH3(float32_input_flush3, float32)
182 GEN_INPUT_FLUSH3(float64_input_flush3, float64)
183 #undef GEN_INPUT_FLUSH3
184 
185 /*
186  * Choose whether to use fpclassify or float32/64_* primitives in the generated
187  * hardfloat functions. Each combination of number of inputs and float size
188  * gets its own value.
189  */
190 #if defined(__x86_64__)
191 # define QEMU_HARDFLOAT_1F32_USE_FP 0
192 # define QEMU_HARDFLOAT_1F64_USE_FP 1
193 # define QEMU_HARDFLOAT_2F32_USE_FP 0
194 # define QEMU_HARDFLOAT_2F64_USE_FP 1
195 # define QEMU_HARDFLOAT_3F32_USE_FP 0
196 # define QEMU_HARDFLOAT_3F64_USE_FP 1
197 #else
198 # define QEMU_HARDFLOAT_1F32_USE_FP 0
199 # define QEMU_HARDFLOAT_1F64_USE_FP 0
200 # define QEMU_HARDFLOAT_2F32_USE_FP 0
201 # define QEMU_HARDFLOAT_2F64_USE_FP 0
202 # define QEMU_HARDFLOAT_3F32_USE_FP 0
203 # define QEMU_HARDFLOAT_3F64_USE_FP 0
204 #endif
205 
206 /*
207  * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over
208  * float{32,64}_is_infinity when !USE_FP.
209  * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup.
210  * On power64 however, using isinf() reduces fp-bench performance by up to 50%.
211  */
212 #if defined(__x86_64__) || defined(__aarch64__)
213 # define QEMU_HARDFLOAT_USE_ISINF   1
214 #else
215 # define QEMU_HARDFLOAT_USE_ISINF   0
216 #endif
217 
218 /*
219  * Some targets clear the FP flags before most FP operations. This prevents
220  * the use of hardfloat, since hardfloat relies on the inexact flag being
221  * already set.
222  */
223 #if defined(TARGET_PPC) || defined(__FAST_MATH__)
224 # if defined(__FAST_MATH__)
225 #  warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \
226     IEEE implementation
227 # endif
228 # define QEMU_NO_HARDFLOAT 1
229 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN
230 #else
231 # define QEMU_NO_HARDFLOAT 0
232 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline))
233 #endif
234 
235 static inline bool can_use_fpu(const float_status *s)
236 {
237     if (QEMU_NO_HARDFLOAT) {
238         return false;
239     }
240     return likely(s->float_exception_flags & float_flag_inexact &&
241                   s->float_rounding_mode == float_round_nearest_even);
242 }
243 
244 /*
245  * Hardfloat generation functions. Each operation can have two flavors:
246  * either using softfloat primitives (e.g. float32_is_zero_or_normal) for
247  * most condition checks, or native ones (e.g. fpclassify).
248  *
249  * The flavor is chosen by the callers. Instead of using macros, we rely on the
250  * compiler to propagate constants and inline everything into the callers.
251  *
252  * We only generate functions for operations with two inputs, since only
253  * these are common enough to justify consolidating them into common code.
254  */
255 
256 typedef union {
257     float32 s;
258     float h;
259 } union_float32;
260 
261 typedef union {
262     float64 s;
263     double h;
264 } union_float64;
265 
266 typedef bool (*f32_check_fn)(union_float32 a, union_float32 b);
267 typedef bool (*f64_check_fn)(union_float64 a, union_float64 b);
268 
269 typedef float32 (*soft_f32_op2_fn)(float32 a, float32 b, float_status *s);
270 typedef float64 (*soft_f64_op2_fn)(float64 a, float64 b, float_status *s);
271 typedef float   (*hard_f32_op2_fn)(float a, float b);
272 typedef double  (*hard_f64_op2_fn)(double a, double b);
273 
274 /* 2-input is-zero-or-normal */
275 static inline bool f32_is_zon2(union_float32 a, union_float32 b)
276 {
277     if (QEMU_HARDFLOAT_2F32_USE_FP) {
278         /*
279          * Not using a temp variable for consecutive fpclassify calls ends up
280          * generating faster code.
281          */
282         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
283                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
284     }
285     return float32_is_zero_or_normal(a.s) &&
286            float32_is_zero_or_normal(b.s);
287 }
288 
289 static inline bool f64_is_zon2(union_float64 a, union_float64 b)
290 {
291     if (QEMU_HARDFLOAT_2F64_USE_FP) {
292         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
293                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
294     }
295     return float64_is_zero_or_normal(a.s) &&
296            float64_is_zero_or_normal(b.s);
297 }
298 
299 /* 3-input is-zero-or-normal */
300 static inline
301 bool f32_is_zon3(union_float32 a, union_float32 b, union_float32 c)
302 {
303     if (QEMU_HARDFLOAT_3F32_USE_FP) {
304         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
305                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
306                (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
307     }
308     return float32_is_zero_or_normal(a.s) &&
309            float32_is_zero_or_normal(b.s) &&
310            float32_is_zero_or_normal(c.s);
311 }
312 
313 static inline
314 bool f64_is_zon3(union_float64 a, union_float64 b, union_float64 c)
315 {
316     if (QEMU_HARDFLOAT_3F64_USE_FP) {
317         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
318                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
319                (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
320     }
321     return float64_is_zero_or_normal(a.s) &&
322            float64_is_zero_or_normal(b.s) &&
323            float64_is_zero_or_normal(c.s);
324 }
325 
326 static inline bool f32_is_inf(union_float32 a)
327 {
328     if (QEMU_HARDFLOAT_USE_ISINF) {
329         return isinf(a.h);
330     }
331     return float32_is_infinity(a.s);
332 }
333 
334 static inline bool f64_is_inf(union_float64 a)
335 {
336     if (QEMU_HARDFLOAT_USE_ISINF) {
337         return isinf(a.h);
338     }
339     return float64_is_infinity(a.s);
340 }
341 
342 static inline float32
343 float32_gen2(float32 xa, float32 xb, float_status *s,
344              hard_f32_op2_fn hard, soft_f32_op2_fn soft,
345              f32_check_fn pre, f32_check_fn post)
346 {
347     union_float32 ua, ub, ur;
348 
349     ua.s = xa;
350     ub.s = xb;
351 
352     if (unlikely(!can_use_fpu(s))) {
353         goto soft;
354     }
355 
356     float32_input_flush2(&ua.s, &ub.s, s);
357     if (unlikely(!pre(ua, ub))) {
358         goto soft;
359     }
360 
361     ur.h = hard(ua.h, ub.h);
362     if (unlikely(f32_is_inf(ur))) {
363         s->float_exception_flags |= float_flag_overflow;
364     } else if (unlikely(fabsf(ur.h) <= FLT_MIN) && post(ua, ub)) {
365         goto soft;
366     }
367     return ur.s;
368 
369  soft:
370     return soft(ua.s, ub.s, s);
371 }
372 
373 static inline float64
374 float64_gen2(float64 xa, float64 xb, float_status *s,
375              hard_f64_op2_fn hard, soft_f64_op2_fn soft,
376              f64_check_fn pre, f64_check_fn post)
377 {
378     union_float64 ua, ub, ur;
379 
380     ua.s = xa;
381     ub.s = xb;
382 
383     if (unlikely(!can_use_fpu(s))) {
384         goto soft;
385     }
386 
387     float64_input_flush2(&ua.s, &ub.s, s);
388     if (unlikely(!pre(ua, ub))) {
389         goto soft;
390     }
391 
392     ur.h = hard(ua.h, ub.h);
393     if (unlikely(f64_is_inf(ur))) {
394         s->float_exception_flags |= float_flag_overflow;
395     } else if (unlikely(fabs(ur.h) <= DBL_MIN) && post(ua, ub)) {
396         goto soft;
397     }
398     return ur.s;
399 
400  soft:
401     return soft(ua.s, ub.s, s);
402 }
403 
404 /*----------------------------------------------------------------------------
405 | Returns the fraction bits of the single-precision floating-point value `a'.
406 *----------------------------------------------------------------------------*/
407 
408 static inline uint32_t extractFloat32Frac(float32 a)
409 {
410     return float32_val(a) & 0x007FFFFF;
411 }
412 
413 /*----------------------------------------------------------------------------
414 | Returns the exponent bits of the single-precision floating-point value `a'.
415 *----------------------------------------------------------------------------*/
416 
417 static inline int extractFloat32Exp(float32 a)
418 {
419     return (float32_val(a) >> 23) & 0xFF;
420 }
421 
422 /*----------------------------------------------------------------------------
423 | Returns the sign bit of the single-precision floating-point value `a'.
424 *----------------------------------------------------------------------------*/
425 
426 static inline bool extractFloat32Sign(float32 a)
427 {
428     return float32_val(a) >> 31;
429 }
430 
431 /*----------------------------------------------------------------------------
432 | Returns the fraction bits of the double-precision floating-point value `a'.
433 *----------------------------------------------------------------------------*/
434 
435 static inline uint64_t extractFloat64Frac(float64 a)
436 {
437     return float64_val(a) & UINT64_C(0x000FFFFFFFFFFFFF);
438 }
439 
440 /*----------------------------------------------------------------------------
441 | Returns the exponent bits of the double-precision floating-point value `a'.
442 *----------------------------------------------------------------------------*/
443 
444 static inline int extractFloat64Exp(float64 a)
445 {
446     return (float64_val(a) >> 52) & 0x7FF;
447 }
448 
449 /*----------------------------------------------------------------------------
450 | Returns the sign bit of the double-precision floating-point value `a'.
451 *----------------------------------------------------------------------------*/
452 
453 static inline bool extractFloat64Sign(float64 a)
454 {
455     return float64_val(a) >> 63;
456 }
457 
458 /*
459  * Classify a floating point number. Everything above float_class_qnan
460  * is a NaN so cls >= float_class_qnan is any NaN.
461  */
462 
463 typedef enum __attribute__ ((__packed__)) {
464     float_class_unclassified,
465     float_class_zero,
466     float_class_normal,
467     float_class_inf,
468     float_class_qnan,  /* all NaNs from here */
469     float_class_snan,
470 } FloatClass;
471 
472 /* Simple helpers for checking if, or what kind of, NaN we have */
473 static inline __attribute__((unused)) bool is_nan(FloatClass c)
474 {
475     return unlikely(c >= float_class_qnan);
476 }
477 
478 static inline __attribute__((unused)) bool is_snan(FloatClass c)
479 {
480     return c == float_class_snan;
481 }
482 
483 static inline __attribute__((unused)) bool is_qnan(FloatClass c)
484 {
485     return c == float_class_qnan;
486 }
487 
488 /*
489  * Structure holding all of the decomposed parts of a float. The
490  * exponent is unbiased and the fraction is normalized. All
491  * calculations are done with a 64 bit fraction and then rounded as
492  * appropriate for the final format.
493  *
494  * Thanks to the packed FloatClass a decent compiler should be able to
495  * fit the whole structure into registers and avoid using the stack
496  * for parameter passing.
497  */
498 
499 typedef struct {
500     uint64_t frac;
501     int32_t  exp;
502     FloatClass cls;
503     bool sign;
504 } FloatParts;
505 
506 #define DECOMPOSED_BINARY_POINT    (64 - 2)
507 #define DECOMPOSED_IMPLICIT_BIT    (1ull << DECOMPOSED_BINARY_POINT)
508 #define DECOMPOSED_OVERFLOW_BIT    (DECOMPOSED_IMPLICIT_BIT << 1)
509 
510 /* Structure holding all of the relevant parameters for a format.
511  *   exp_size: the size of the exponent field
512  *   exp_bias: the offset applied to the exponent field
513  *   exp_max: the maximum normalised exponent
514  *   frac_size: the size of the fraction field
515  *   frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
516  * The following are computed based the size of fraction
517  *   frac_lsb: least significant bit of fraction
518  *   frac_lsbm1: the bit below the least significant bit (for rounding)
519  *   round_mask/roundeven_mask: masks used for rounding
520  * The following optional modifiers are available:
521  *   arm_althp: handle ARM Alternative Half Precision
522  */
523 typedef struct {
524     int exp_size;
525     int exp_bias;
526     int exp_max;
527     int frac_size;
528     int frac_shift;
529     uint64_t frac_lsb;
530     uint64_t frac_lsbm1;
531     uint64_t round_mask;
532     uint64_t roundeven_mask;
533     bool arm_althp;
534 } FloatFmt;
535 
536 /* Expand fields based on the size of exponent and fraction */
537 #define FLOAT_PARAMS(E, F)                                           \
538     .exp_size       = E,                                             \
539     .exp_bias       = ((1 << E) - 1) >> 1,                           \
540     .exp_max        = (1 << E) - 1,                                  \
541     .frac_size      = F,                                             \
542     .frac_shift     = DECOMPOSED_BINARY_POINT - F,                   \
543     .frac_lsb       = 1ull << (DECOMPOSED_BINARY_POINT - F),         \
544     .frac_lsbm1     = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1),   \
545     .round_mask     = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1,   \
546     .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1
547 
548 static const FloatFmt float16_params = {
549     FLOAT_PARAMS(5, 10)
550 };
551 
552 static const FloatFmt float16_params_ahp = {
553     FLOAT_PARAMS(5, 10),
554     .arm_althp = true
555 };
556 
557 static const FloatFmt float32_params = {
558     FLOAT_PARAMS(8, 23)
559 };
560 
561 static const FloatFmt float64_params = {
562     FLOAT_PARAMS(11, 52)
563 };
564 
565 /* Unpack a float to parts, but do not canonicalize.  */
566 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw)
567 {
568     const int sign_pos = fmt.frac_size + fmt.exp_size;
569 
570     return (FloatParts) {
571         .cls = float_class_unclassified,
572         .sign = extract64(raw, sign_pos, 1),
573         .exp = extract64(raw, fmt.frac_size, fmt.exp_size),
574         .frac = extract64(raw, 0, fmt.frac_size),
575     };
576 }
577 
578 static inline FloatParts float16_unpack_raw(float16 f)
579 {
580     return unpack_raw(float16_params, f);
581 }
582 
583 static inline FloatParts float32_unpack_raw(float32 f)
584 {
585     return unpack_raw(float32_params, f);
586 }
587 
588 static inline FloatParts float64_unpack_raw(float64 f)
589 {
590     return unpack_raw(float64_params, f);
591 }
592 
593 /* Pack a float from parts, but do not canonicalize.  */
594 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p)
595 {
596     const int sign_pos = fmt.frac_size + fmt.exp_size;
597     uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp);
598     return deposit64(ret, sign_pos, 1, p.sign);
599 }
600 
601 static inline float16 float16_pack_raw(FloatParts p)
602 {
603     return make_float16(pack_raw(float16_params, p));
604 }
605 
606 static inline float32 float32_pack_raw(FloatParts p)
607 {
608     return make_float32(pack_raw(float32_params, p));
609 }
610 
611 static inline float64 float64_pack_raw(FloatParts p)
612 {
613     return make_float64(pack_raw(float64_params, p));
614 }
615 
616 /*----------------------------------------------------------------------------
617 | Functions and definitions to determine:  (1) whether tininess for underflow
618 | is detected before or after rounding by default, (2) what (if anything)
619 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
620 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
621 | are propagated from function inputs to output.  These details are target-
622 | specific.
623 *----------------------------------------------------------------------------*/
624 #include "softfloat-specialize.inc.c"
625 
626 /* Canonicalize EXP and FRAC, setting CLS.  */
627 static FloatParts sf_canonicalize(FloatParts part, const FloatFmt *parm,
628                                   float_status *status)
629 {
630     if (part.exp == parm->exp_max && !parm->arm_althp) {
631         if (part.frac == 0) {
632             part.cls = float_class_inf;
633         } else {
634             part.frac <<= parm->frac_shift;
635             part.cls = (parts_is_snan_frac(part.frac, status)
636                         ? float_class_snan : float_class_qnan);
637         }
638     } else if (part.exp == 0) {
639         if (likely(part.frac == 0)) {
640             part.cls = float_class_zero;
641         } else if (status->flush_inputs_to_zero) {
642             float_raise(float_flag_input_denormal, status);
643             part.cls = float_class_zero;
644             part.frac = 0;
645         } else {
646             int shift = clz64(part.frac) - 1;
647             part.cls = float_class_normal;
648             part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
649             part.frac <<= shift;
650         }
651     } else {
652         part.cls = float_class_normal;
653         part.exp -= parm->exp_bias;
654         part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
655     }
656     return part;
657 }
658 
659 /* Round and uncanonicalize a floating-point number by parts. There
660  * are FRAC_SHIFT bits that may require rounding at the bottom of the
661  * fraction; these bits will be removed. The exponent will be biased
662  * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
663  */
664 
665 static FloatParts round_canonical(FloatParts p, float_status *s,
666                                   const FloatFmt *parm)
667 {
668     const uint64_t frac_lsb = parm->frac_lsb;
669     const uint64_t frac_lsbm1 = parm->frac_lsbm1;
670     const uint64_t round_mask = parm->round_mask;
671     const uint64_t roundeven_mask = parm->roundeven_mask;
672     const int exp_max = parm->exp_max;
673     const int frac_shift = parm->frac_shift;
674     uint64_t frac, inc;
675     int exp, flags = 0;
676     bool overflow_norm;
677 
678     frac = p.frac;
679     exp = p.exp;
680 
681     switch (p.cls) {
682     case float_class_normal:
683         switch (s->float_rounding_mode) {
684         case float_round_nearest_even:
685             overflow_norm = false;
686             inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
687             break;
688         case float_round_ties_away:
689             overflow_norm = false;
690             inc = frac_lsbm1;
691             break;
692         case float_round_to_zero:
693             overflow_norm = true;
694             inc = 0;
695             break;
696         case float_round_up:
697             inc = p.sign ? 0 : round_mask;
698             overflow_norm = p.sign;
699             break;
700         case float_round_down:
701             inc = p.sign ? round_mask : 0;
702             overflow_norm = !p.sign;
703             break;
704         case float_round_to_odd:
705             overflow_norm = true;
706             inc = frac & frac_lsb ? 0 : round_mask;
707             break;
708         default:
709             g_assert_not_reached();
710         }
711 
712         exp += parm->exp_bias;
713         if (likely(exp > 0)) {
714             if (frac & round_mask) {
715                 flags |= float_flag_inexact;
716                 frac += inc;
717                 if (frac & DECOMPOSED_OVERFLOW_BIT) {
718                     frac >>= 1;
719                     exp++;
720                 }
721             }
722             frac >>= frac_shift;
723 
724             if (parm->arm_althp) {
725                 /* ARM Alt HP eschews Inf and NaN for a wider exponent.  */
726                 if (unlikely(exp > exp_max)) {
727                     /* Overflow.  Return the maximum normal.  */
728                     flags = float_flag_invalid;
729                     exp = exp_max;
730                     frac = -1;
731                 }
732             } else if (unlikely(exp >= exp_max)) {
733                 flags |= float_flag_overflow | float_flag_inexact;
734                 if (overflow_norm) {
735                     exp = exp_max - 1;
736                     frac = -1;
737                 } else {
738                     p.cls = float_class_inf;
739                     goto do_inf;
740                 }
741             }
742         } else if (s->flush_to_zero) {
743             flags |= float_flag_output_denormal;
744             p.cls = float_class_zero;
745             goto do_zero;
746         } else {
747             bool is_tiny = s->tininess_before_rounding
748                         || (exp < 0)
749                         || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT);
750 
751             shift64RightJamming(frac, 1 - exp, &frac);
752             if (frac & round_mask) {
753                 /* Need to recompute round-to-even.  */
754                 switch (s->float_rounding_mode) {
755                 case float_round_nearest_even:
756                     inc = ((frac & roundeven_mask) != frac_lsbm1
757                            ? frac_lsbm1 : 0);
758                     break;
759                 case float_round_to_odd:
760                     inc = frac & frac_lsb ? 0 : round_mask;
761                     break;
762                 default:
763                     break;
764                 }
765                 flags |= float_flag_inexact;
766                 frac += inc;
767             }
768 
769             exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
770             frac >>= frac_shift;
771 
772             if (is_tiny && (flags & float_flag_inexact)) {
773                 flags |= float_flag_underflow;
774             }
775             if (exp == 0 && frac == 0) {
776                 p.cls = float_class_zero;
777             }
778         }
779         break;
780 
781     case float_class_zero:
782     do_zero:
783         exp = 0;
784         frac = 0;
785         break;
786 
787     case float_class_inf:
788     do_inf:
789         assert(!parm->arm_althp);
790         exp = exp_max;
791         frac = 0;
792         break;
793 
794     case float_class_qnan:
795     case float_class_snan:
796         assert(!parm->arm_althp);
797         exp = exp_max;
798         frac >>= parm->frac_shift;
799         break;
800 
801     default:
802         g_assert_not_reached();
803     }
804 
805     float_raise(flags, s);
806     p.exp = exp;
807     p.frac = frac;
808     return p;
809 }
810 
811 /* Explicit FloatFmt version */
812 static FloatParts float16a_unpack_canonical(float16 f, float_status *s,
813                                             const FloatFmt *params)
814 {
815     return sf_canonicalize(float16_unpack_raw(f), params, s);
816 }
817 
818 static FloatParts float16_unpack_canonical(float16 f, float_status *s)
819 {
820     return float16a_unpack_canonical(f, s, &float16_params);
821 }
822 
823 static float16 float16a_round_pack_canonical(FloatParts p, float_status *s,
824                                              const FloatFmt *params)
825 {
826     return float16_pack_raw(round_canonical(p, s, params));
827 }
828 
829 static float16 float16_round_pack_canonical(FloatParts p, float_status *s)
830 {
831     return float16a_round_pack_canonical(p, s, &float16_params);
832 }
833 
834 static FloatParts float32_unpack_canonical(float32 f, float_status *s)
835 {
836     return sf_canonicalize(float32_unpack_raw(f), &float32_params, s);
837 }
838 
839 static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
840 {
841     return float32_pack_raw(round_canonical(p, s, &float32_params));
842 }
843 
844 static FloatParts float64_unpack_canonical(float64 f, float_status *s)
845 {
846     return sf_canonicalize(float64_unpack_raw(f), &float64_params, s);
847 }
848 
849 static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
850 {
851     return float64_pack_raw(round_canonical(p, s, &float64_params));
852 }
853 
854 static FloatParts return_nan(FloatParts a, float_status *s)
855 {
856     switch (a.cls) {
857     case float_class_snan:
858         s->float_exception_flags |= float_flag_invalid;
859         a = parts_silence_nan(a, s);
860         /* fall through */
861     case float_class_qnan:
862         if (s->default_nan_mode) {
863             return parts_default_nan(s);
864         }
865         break;
866 
867     default:
868         g_assert_not_reached();
869     }
870     return a;
871 }
872 
873 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s)
874 {
875     if (is_snan(a.cls) || is_snan(b.cls)) {
876         s->float_exception_flags |= float_flag_invalid;
877     }
878 
879     if (s->default_nan_mode) {
880         return parts_default_nan(s);
881     } else {
882         if (pickNaN(a.cls, b.cls,
883                     a.frac > b.frac ||
884                     (a.frac == b.frac && a.sign < b.sign))) {
885             a = b;
886         }
887         if (is_snan(a.cls)) {
888             return parts_silence_nan(a, s);
889         }
890     }
891     return a;
892 }
893 
894 static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c,
895                                   bool inf_zero, float_status *s)
896 {
897     int which;
898 
899     if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
900         s->float_exception_flags |= float_flag_invalid;
901     }
902 
903     which = pickNaNMulAdd(a.cls, b.cls, c.cls, inf_zero, s);
904 
905     if (s->default_nan_mode) {
906         /* Note that this check is after pickNaNMulAdd so that function
907          * has an opportunity to set the Invalid flag.
908          */
909         which = 3;
910     }
911 
912     switch (which) {
913     case 0:
914         break;
915     case 1:
916         a = b;
917         break;
918     case 2:
919         a = c;
920         break;
921     case 3:
922         return parts_default_nan(s);
923     default:
924         g_assert_not_reached();
925     }
926 
927     if (is_snan(a.cls)) {
928         return parts_silence_nan(a, s);
929     }
930     return a;
931 }
932 
933 /*
934  * Returns the result of adding or subtracting the values of the
935  * floating-point values `a' and `b'. The operation is performed
936  * according to the IEC/IEEE Standard for Binary Floating-Point
937  * Arithmetic.
938  */
939 
940 static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
941                                 float_status *s)
942 {
943     bool a_sign = a.sign;
944     bool b_sign = b.sign ^ subtract;
945 
946     if (a_sign != b_sign) {
947         /* Subtraction */
948 
949         if (a.cls == float_class_normal && b.cls == float_class_normal) {
950             if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
951                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
952                 a.frac = a.frac - b.frac;
953             } else {
954                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
955                 a.frac = b.frac - a.frac;
956                 a.exp = b.exp;
957                 a_sign ^= 1;
958             }
959 
960             if (a.frac == 0) {
961                 a.cls = float_class_zero;
962                 a.sign = s->float_rounding_mode == float_round_down;
963             } else {
964                 int shift = clz64(a.frac) - 1;
965                 a.frac = a.frac << shift;
966                 a.exp = a.exp - shift;
967                 a.sign = a_sign;
968             }
969             return a;
970         }
971         if (is_nan(a.cls) || is_nan(b.cls)) {
972             return pick_nan(a, b, s);
973         }
974         if (a.cls == float_class_inf) {
975             if (b.cls == float_class_inf) {
976                 float_raise(float_flag_invalid, s);
977                 return parts_default_nan(s);
978             }
979             return a;
980         }
981         if (a.cls == float_class_zero && b.cls == float_class_zero) {
982             a.sign = s->float_rounding_mode == float_round_down;
983             return a;
984         }
985         if (a.cls == float_class_zero || b.cls == float_class_inf) {
986             b.sign = a_sign ^ 1;
987             return b;
988         }
989         if (b.cls == float_class_zero) {
990             return a;
991         }
992     } else {
993         /* Addition */
994         if (a.cls == float_class_normal && b.cls == float_class_normal) {
995             if (a.exp > b.exp) {
996                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
997             } else if (a.exp < b.exp) {
998                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
999                 a.exp = b.exp;
1000             }
1001             a.frac += b.frac;
1002             if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
1003                 shift64RightJamming(a.frac, 1, &a.frac);
1004                 a.exp += 1;
1005             }
1006             return a;
1007         }
1008         if (is_nan(a.cls) || is_nan(b.cls)) {
1009             return pick_nan(a, b, s);
1010         }
1011         if (a.cls == float_class_inf || b.cls == float_class_zero) {
1012             return a;
1013         }
1014         if (b.cls == float_class_inf || a.cls == float_class_zero) {
1015             b.sign = b_sign;
1016             return b;
1017         }
1018     }
1019     g_assert_not_reached();
1020 }
1021 
1022 /*
1023  * Returns the result of adding or subtracting the floating-point
1024  * values `a' and `b'. The operation is performed according to the
1025  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1026  */
1027 
1028 float16 QEMU_FLATTEN float16_add(float16 a, float16 b, float_status *status)
1029 {
1030     FloatParts pa = float16_unpack_canonical(a, status);
1031     FloatParts pb = float16_unpack_canonical(b, status);
1032     FloatParts pr = addsub_floats(pa, pb, false, status);
1033 
1034     return float16_round_pack_canonical(pr, status);
1035 }
1036 
1037 float16 QEMU_FLATTEN float16_sub(float16 a, float16 b, float_status *status)
1038 {
1039     FloatParts pa = float16_unpack_canonical(a, status);
1040     FloatParts pb = float16_unpack_canonical(b, status);
1041     FloatParts pr = addsub_floats(pa, pb, true, status);
1042 
1043     return float16_round_pack_canonical(pr, status);
1044 }
1045 
1046 static float32 QEMU_SOFTFLOAT_ATTR
1047 soft_f32_addsub(float32 a, float32 b, bool subtract, float_status *status)
1048 {
1049     FloatParts pa = float32_unpack_canonical(a, status);
1050     FloatParts pb = float32_unpack_canonical(b, status);
1051     FloatParts pr = addsub_floats(pa, pb, subtract, status);
1052 
1053     return float32_round_pack_canonical(pr, status);
1054 }
1055 
1056 static inline float32 soft_f32_add(float32 a, float32 b, float_status *status)
1057 {
1058     return soft_f32_addsub(a, b, false, status);
1059 }
1060 
1061 static inline float32 soft_f32_sub(float32 a, float32 b, float_status *status)
1062 {
1063     return soft_f32_addsub(a, b, true, status);
1064 }
1065 
1066 static float64 QEMU_SOFTFLOAT_ATTR
1067 soft_f64_addsub(float64 a, float64 b, bool subtract, float_status *status)
1068 {
1069     FloatParts pa = float64_unpack_canonical(a, status);
1070     FloatParts pb = float64_unpack_canonical(b, status);
1071     FloatParts pr = addsub_floats(pa, pb, subtract, status);
1072 
1073     return float64_round_pack_canonical(pr, status);
1074 }
1075 
1076 static inline float64 soft_f64_add(float64 a, float64 b, float_status *status)
1077 {
1078     return soft_f64_addsub(a, b, false, status);
1079 }
1080 
1081 static inline float64 soft_f64_sub(float64 a, float64 b, float_status *status)
1082 {
1083     return soft_f64_addsub(a, b, true, status);
1084 }
1085 
1086 static float hard_f32_add(float a, float b)
1087 {
1088     return a + b;
1089 }
1090 
1091 static float hard_f32_sub(float a, float b)
1092 {
1093     return a - b;
1094 }
1095 
1096 static double hard_f64_add(double a, double b)
1097 {
1098     return a + b;
1099 }
1100 
1101 static double hard_f64_sub(double a, double b)
1102 {
1103     return a - b;
1104 }
1105 
1106 static bool f32_addsubmul_post(union_float32 a, union_float32 b)
1107 {
1108     if (QEMU_HARDFLOAT_2F32_USE_FP) {
1109         return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
1110     }
1111     return !(float32_is_zero(a.s) && float32_is_zero(b.s));
1112 }
1113 
1114 static bool f64_addsubmul_post(union_float64 a, union_float64 b)
1115 {
1116     if (QEMU_HARDFLOAT_2F64_USE_FP) {
1117         return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
1118     } else {
1119         return !(float64_is_zero(a.s) && float64_is_zero(b.s));
1120     }
1121 }
1122 
1123 static float32 float32_addsub(float32 a, float32 b, float_status *s,
1124                               hard_f32_op2_fn hard, soft_f32_op2_fn soft)
1125 {
1126     return float32_gen2(a, b, s, hard, soft,
1127                         f32_is_zon2, f32_addsubmul_post);
1128 }
1129 
1130 static float64 float64_addsub(float64 a, float64 b, float_status *s,
1131                               hard_f64_op2_fn hard, soft_f64_op2_fn soft)
1132 {
1133     return float64_gen2(a, b, s, hard, soft,
1134                         f64_is_zon2, f64_addsubmul_post);
1135 }
1136 
1137 float32 QEMU_FLATTEN
1138 float32_add(float32 a, float32 b, float_status *s)
1139 {
1140     return float32_addsub(a, b, s, hard_f32_add, soft_f32_add);
1141 }
1142 
1143 float32 QEMU_FLATTEN
1144 float32_sub(float32 a, float32 b, float_status *s)
1145 {
1146     return float32_addsub(a, b, s, hard_f32_sub, soft_f32_sub);
1147 }
1148 
1149 float64 QEMU_FLATTEN
1150 float64_add(float64 a, float64 b, float_status *s)
1151 {
1152     return float64_addsub(a, b, s, hard_f64_add, soft_f64_add);
1153 }
1154 
1155 float64 QEMU_FLATTEN
1156 float64_sub(float64 a, float64 b, float_status *s)
1157 {
1158     return float64_addsub(a, b, s, hard_f64_sub, soft_f64_sub);
1159 }
1160 
1161 /*
1162  * Returns the result of multiplying the floating-point values `a' and
1163  * `b'. The operation is performed according to the IEC/IEEE Standard
1164  * for Binary Floating-Point Arithmetic.
1165  */
1166 
1167 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s)
1168 {
1169     bool sign = a.sign ^ b.sign;
1170 
1171     if (a.cls == float_class_normal && b.cls == float_class_normal) {
1172         uint64_t hi, lo;
1173         int exp = a.exp + b.exp;
1174 
1175         mul64To128(a.frac, b.frac, &hi, &lo);
1176         shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1177         if (lo & DECOMPOSED_OVERFLOW_BIT) {
1178             shift64RightJamming(lo, 1, &lo);
1179             exp += 1;
1180         }
1181 
1182         /* Re-use a */
1183         a.exp = exp;
1184         a.sign = sign;
1185         a.frac = lo;
1186         return a;
1187     }
1188     /* handle all the NaN cases */
1189     if (is_nan(a.cls) || is_nan(b.cls)) {
1190         return pick_nan(a, b, s);
1191     }
1192     /* Inf * Zero == NaN */
1193     if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
1194         (a.cls == float_class_zero && b.cls == float_class_inf)) {
1195         s->float_exception_flags |= float_flag_invalid;
1196         return parts_default_nan(s);
1197     }
1198     /* Multiply by 0 or Inf */
1199     if (a.cls == float_class_inf || a.cls == float_class_zero) {
1200         a.sign = sign;
1201         return a;
1202     }
1203     if (b.cls == float_class_inf || b.cls == float_class_zero) {
1204         b.sign = sign;
1205         return b;
1206     }
1207     g_assert_not_reached();
1208 }
1209 
1210 float16 QEMU_FLATTEN float16_mul(float16 a, float16 b, float_status *status)
1211 {
1212     FloatParts pa = float16_unpack_canonical(a, status);
1213     FloatParts pb = float16_unpack_canonical(b, status);
1214     FloatParts pr = mul_floats(pa, pb, status);
1215 
1216     return float16_round_pack_canonical(pr, status);
1217 }
1218 
1219 static float32 QEMU_SOFTFLOAT_ATTR
1220 soft_f32_mul(float32 a, float32 b, float_status *status)
1221 {
1222     FloatParts pa = float32_unpack_canonical(a, status);
1223     FloatParts pb = float32_unpack_canonical(b, status);
1224     FloatParts pr = mul_floats(pa, pb, status);
1225 
1226     return float32_round_pack_canonical(pr, status);
1227 }
1228 
1229 static float64 QEMU_SOFTFLOAT_ATTR
1230 soft_f64_mul(float64 a, float64 b, float_status *status)
1231 {
1232     FloatParts pa = float64_unpack_canonical(a, status);
1233     FloatParts pb = float64_unpack_canonical(b, status);
1234     FloatParts pr = mul_floats(pa, pb, status);
1235 
1236     return float64_round_pack_canonical(pr, status);
1237 }
1238 
1239 static float hard_f32_mul(float a, float b)
1240 {
1241     return a * b;
1242 }
1243 
1244 static double hard_f64_mul(double a, double b)
1245 {
1246     return a * b;
1247 }
1248 
1249 float32 QEMU_FLATTEN
1250 float32_mul(float32 a, float32 b, float_status *s)
1251 {
1252     return float32_gen2(a, b, s, hard_f32_mul, soft_f32_mul,
1253                         f32_is_zon2, f32_addsubmul_post);
1254 }
1255 
1256 float64 QEMU_FLATTEN
1257 float64_mul(float64 a, float64 b, float_status *s)
1258 {
1259     return float64_gen2(a, b, s, hard_f64_mul, soft_f64_mul,
1260                         f64_is_zon2, f64_addsubmul_post);
1261 }
1262 
1263 /*
1264  * Returns the result of multiplying the floating-point values `a' and
1265  * `b' then adding 'c', with no intermediate rounding step after the
1266  * multiplication. The operation is performed according to the
1267  * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
1268  * The flags argument allows the caller to select negation of the
1269  * addend, the intermediate product, or the final result. (The
1270  * difference between this and having the caller do a separate
1271  * negation is that negating externally will flip the sign bit on
1272  * NaNs.)
1273  */
1274 
1275 static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c,
1276                                 int flags, float_status *s)
1277 {
1278     bool inf_zero = ((1 << a.cls) | (1 << b.cls)) ==
1279                     ((1 << float_class_inf) | (1 << float_class_zero));
1280     bool p_sign;
1281     bool sign_flip = flags & float_muladd_negate_result;
1282     FloatClass p_class;
1283     uint64_t hi, lo;
1284     int p_exp;
1285 
1286     /* It is implementation-defined whether the cases of (0,inf,qnan)
1287      * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
1288      * they return if they do), so we have to hand this information
1289      * off to the target-specific pick-a-NaN routine.
1290      */
1291     if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) {
1292         return pick_nan_muladd(a, b, c, inf_zero, s);
1293     }
1294 
1295     if (inf_zero) {
1296         s->float_exception_flags |= float_flag_invalid;
1297         return parts_default_nan(s);
1298     }
1299 
1300     if (flags & float_muladd_negate_c) {
1301         c.sign ^= 1;
1302     }
1303 
1304     p_sign = a.sign ^ b.sign;
1305 
1306     if (flags & float_muladd_negate_product) {
1307         p_sign ^= 1;
1308     }
1309 
1310     if (a.cls == float_class_inf || b.cls == float_class_inf) {
1311         p_class = float_class_inf;
1312     } else if (a.cls == float_class_zero || b.cls == float_class_zero) {
1313         p_class = float_class_zero;
1314     } else {
1315         p_class = float_class_normal;
1316     }
1317 
1318     if (c.cls == float_class_inf) {
1319         if (p_class == float_class_inf && p_sign != c.sign) {
1320             s->float_exception_flags |= float_flag_invalid;
1321             return parts_default_nan(s);
1322         } else {
1323             a.cls = float_class_inf;
1324             a.sign = c.sign ^ sign_flip;
1325             return a;
1326         }
1327     }
1328 
1329     if (p_class == float_class_inf) {
1330         a.cls = float_class_inf;
1331         a.sign = p_sign ^ sign_flip;
1332         return a;
1333     }
1334 
1335     if (p_class == float_class_zero) {
1336         if (c.cls == float_class_zero) {
1337             if (p_sign != c.sign) {
1338                 p_sign = s->float_rounding_mode == float_round_down;
1339             }
1340             c.sign = p_sign;
1341         } else if (flags & float_muladd_halve_result) {
1342             c.exp -= 1;
1343         }
1344         c.sign ^= sign_flip;
1345         return c;
1346     }
1347 
1348     /* a & b should be normals now... */
1349     assert(a.cls == float_class_normal &&
1350            b.cls == float_class_normal);
1351 
1352     p_exp = a.exp + b.exp;
1353 
1354     /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
1355      * result.
1356      */
1357     mul64To128(a.frac, b.frac, &hi, &lo);
1358     /* binary point now at bit 124 */
1359 
1360     /* check for overflow */
1361     if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) {
1362         shift128RightJamming(hi, lo, 1, &hi, &lo);
1363         p_exp += 1;
1364     }
1365 
1366     /* + add/sub */
1367     if (c.cls == float_class_zero) {
1368         /* move binary point back to 62 */
1369         shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1370     } else {
1371         int exp_diff = p_exp - c.exp;
1372         if (p_sign == c.sign) {
1373             /* Addition */
1374             if (exp_diff <= 0) {
1375                 shift128RightJamming(hi, lo,
1376                                      DECOMPOSED_BINARY_POINT - exp_diff,
1377                                      &hi, &lo);
1378                 lo += c.frac;
1379                 p_exp = c.exp;
1380             } else {
1381                 uint64_t c_hi, c_lo;
1382                 /* shift c to the same binary point as the product (124) */
1383                 c_hi = c.frac >> 2;
1384                 c_lo = 0;
1385                 shift128RightJamming(c_hi, c_lo,
1386                                      exp_diff,
1387                                      &c_hi, &c_lo);
1388                 add128(hi, lo, c_hi, c_lo, &hi, &lo);
1389                 /* move binary point back to 62 */
1390                 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1391             }
1392 
1393             if (lo & DECOMPOSED_OVERFLOW_BIT) {
1394                 shift64RightJamming(lo, 1, &lo);
1395                 p_exp += 1;
1396             }
1397 
1398         } else {
1399             /* Subtraction */
1400             uint64_t c_hi, c_lo;
1401             /* make C binary point match product at bit 124 */
1402             c_hi = c.frac >> 2;
1403             c_lo = 0;
1404 
1405             if (exp_diff <= 0) {
1406                 shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
1407                 if (exp_diff == 0
1408                     &&
1409                     (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
1410                     sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1411                 } else {
1412                     sub128(c_hi, c_lo, hi, lo, &hi, &lo);
1413                     p_sign ^= 1;
1414                     p_exp = c.exp;
1415                 }
1416             } else {
1417                 shift128RightJamming(c_hi, c_lo,
1418                                      exp_diff,
1419                                      &c_hi, &c_lo);
1420                 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1421             }
1422 
1423             if (hi == 0 && lo == 0) {
1424                 a.cls = float_class_zero;
1425                 a.sign = s->float_rounding_mode == float_round_down;
1426                 a.sign ^= sign_flip;
1427                 return a;
1428             } else {
1429                 int shift;
1430                 if (hi != 0) {
1431                     shift = clz64(hi);
1432                 } else {
1433                     shift = clz64(lo) + 64;
1434                 }
1435                 /* Normalizing to a binary point of 124 is the
1436                    correct adjust for the exponent.  However since we're
1437                    shifting, we might as well put the binary point back
1438                    at 62 where we really want it.  Therefore shift as
1439                    if we're leaving 1 bit at the top of the word, but
1440                    adjust the exponent as if we're leaving 3 bits.  */
1441                 shift -= 1;
1442                 if (shift >= 64) {
1443                     lo = lo << (shift - 64);
1444                 } else {
1445                     hi = (hi << shift) | (lo >> (64 - shift));
1446                     lo = hi | ((lo << shift) != 0);
1447                 }
1448                 p_exp -= shift - 2;
1449             }
1450         }
1451     }
1452 
1453     if (flags & float_muladd_halve_result) {
1454         p_exp -= 1;
1455     }
1456 
1457     /* finally prepare our result */
1458     a.cls = float_class_normal;
1459     a.sign = p_sign ^ sign_flip;
1460     a.exp = p_exp;
1461     a.frac = lo;
1462 
1463     return a;
1464 }
1465 
1466 float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c,
1467                                                 int flags, float_status *status)
1468 {
1469     FloatParts pa = float16_unpack_canonical(a, status);
1470     FloatParts pb = float16_unpack_canonical(b, status);
1471     FloatParts pc = float16_unpack_canonical(c, status);
1472     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1473 
1474     return float16_round_pack_canonical(pr, status);
1475 }
1476 
1477 static float32 QEMU_SOFTFLOAT_ATTR
1478 soft_f32_muladd(float32 a, float32 b, float32 c, int flags,
1479                 float_status *status)
1480 {
1481     FloatParts pa = float32_unpack_canonical(a, status);
1482     FloatParts pb = float32_unpack_canonical(b, status);
1483     FloatParts pc = float32_unpack_canonical(c, status);
1484     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1485 
1486     return float32_round_pack_canonical(pr, status);
1487 }
1488 
1489 static float64 QEMU_SOFTFLOAT_ATTR
1490 soft_f64_muladd(float64 a, float64 b, float64 c, int flags,
1491                 float_status *status)
1492 {
1493     FloatParts pa = float64_unpack_canonical(a, status);
1494     FloatParts pb = float64_unpack_canonical(b, status);
1495     FloatParts pc = float64_unpack_canonical(c, status);
1496     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1497 
1498     return float64_round_pack_canonical(pr, status);
1499 }
1500 
1501 static bool force_soft_fma;
1502 
1503 float32 QEMU_FLATTEN
1504 float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s)
1505 {
1506     union_float32 ua, ub, uc, ur;
1507 
1508     ua.s = xa;
1509     ub.s = xb;
1510     uc.s = xc;
1511 
1512     if (unlikely(!can_use_fpu(s))) {
1513         goto soft;
1514     }
1515     if (unlikely(flags & float_muladd_halve_result)) {
1516         goto soft;
1517     }
1518 
1519     float32_input_flush3(&ua.s, &ub.s, &uc.s, s);
1520     if (unlikely(!f32_is_zon3(ua, ub, uc))) {
1521         goto soft;
1522     }
1523 
1524     if (unlikely(force_soft_fma)) {
1525         goto soft;
1526     }
1527 
1528     /*
1529      * When (a || b) == 0, there's no need to check for under/over flow,
1530      * since we know the addend is (normal || 0) and the product is 0.
1531      */
1532     if (float32_is_zero(ua.s) || float32_is_zero(ub.s)) {
1533         union_float32 up;
1534         bool prod_sign;
1535 
1536         prod_sign = float32_is_neg(ua.s) ^ float32_is_neg(ub.s);
1537         prod_sign ^= !!(flags & float_muladd_negate_product);
1538         up.s = float32_set_sign(float32_zero, prod_sign);
1539 
1540         if (flags & float_muladd_negate_c) {
1541             uc.h = -uc.h;
1542         }
1543         ur.h = up.h + uc.h;
1544     } else {
1545         union_float32 ua_orig = ua;
1546         union_float32 uc_orig = uc;
1547 
1548         if (flags & float_muladd_negate_product) {
1549             ua.h = -ua.h;
1550         }
1551         if (flags & float_muladd_negate_c) {
1552             uc.h = -uc.h;
1553         }
1554 
1555         ur.h = fmaf(ua.h, ub.h, uc.h);
1556 
1557         if (unlikely(f32_is_inf(ur))) {
1558             s->float_exception_flags |= float_flag_overflow;
1559         } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) {
1560             ua = ua_orig;
1561             uc = uc_orig;
1562             goto soft;
1563         }
1564     }
1565     if (flags & float_muladd_negate_result) {
1566         return float32_chs(ur.s);
1567     }
1568     return ur.s;
1569 
1570  soft:
1571     return soft_f32_muladd(ua.s, ub.s, uc.s, flags, s);
1572 }
1573 
1574 float64 QEMU_FLATTEN
1575 float64_muladd(float64 xa, float64 xb, float64 xc, int flags, float_status *s)
1576 {
1577     union_float64 ua, ub, uc, ur;
1578 
1579     ua.s = xa;
1580     ub.s = xb;
1581     uc.s = xc;
1582 
1583     if (unlikely(!can_use_fpu(s))) {
1584         goto soft;
1585     }
1586     if (unlikely(flags & float_muladd_halve_result)) {
1587         goto soft;
1588     }
1589 
1590     float64_input_flush3(&ua.s, &ub.s, &uc.s, s);
1591     if (unlikely(!f64_is_zon3(ua, ub, uc))) {
1592         goto soft;
1593     }
1594 
1595     if (unlikely(force_soft_fma)) {
1596         goto soft;
1597     }
1598 
1599     /*
1600      * When (a || b) == 0, there's no need to check for under/over flow,
1601      * since we know the addend is (normal || 0) and the product is 0.
1602      */
1603     if (float64_is_zero(ua.s) || float64_is_zero(ub.s)) {
1604         union_float64 up;
1605         bool prod_sign;
1606 
1607         prod_sign = float64_is_neg(ua.s) ^ float64_is_neg(ub.s);
1608         prod_sign ^= !!(flags & float_muladd_negate_product);
1609         up.s = float64_set_sign(float64_zero, prod_sign);
1610 
1611         if (flags & float_muladd_negate_c) {
1612             uc.h = -uc.h;
1613         }
1614         ur.h = up.h + uc.h;
1615     } else {
1616         union_float64 ua_orig = ua;
1617         union_float64 uc_orig = uc;
1618 
1619         if (flags & float_muladd_negate_product) {
1620             ua.h = -ua.h;
1621         }
1622         if (flags & float_muladd_negate_c) {
1623             uc.h = -uc.h;
1624         }
1625 
1626         ur.h = fma(ua.h, ub.h, uc.h);
1627 
1628         if (unlikely(f64_is_inf(ur))) {
1629             s->float_exception_flags |= float_flag_overflow;
1630         } else if (unlikely(fabs(ur.h) <= FLT_MIN)) {
1631             ua = ua_orig;
1632             uc = uc_orig;
1633             goto soft;
1634         }
1635     }
1636     if (flags & float_muladd_negate_result) {
1637         return float64_chs(ur.s);
1638     }
1639     return ur.s;
1640 
1641  soft:
1642     return soft_f64_muladd(ua.s, ub.s, uc.s, flags, s);
1643 }
1644 
1645 /*
1646  * Returns the result of dividing the floating-point value `a' by the
1647  * corresponding value `b'. The operation is performed according to
1648  * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1649  */
1650 
1651 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s)
1652 {
1653     bool sign = a.sign ^ b.sign;
1654 
1655     if (a.cls == float_class_normal && b.cls == float_class_normal) {
1656         uint64_t n0, n1, q, r;
1657         int exp = a.exp - b.exp;
1658 
1659         /*
1660          * We want a 2*N / N-bit division to produce exactly an N-bit
1661          * result, so that we do not lose any precision and so that we
1662          * do not have to renormalize afterward.  If A.frac < B.frac,
1663          * then division would produce an (N-1)-bit result; shift A left
1664          * by one to produce the an N-bit result, and decrement the
1665          * exponent to match.
1666          *
1667          * The udiv_qrnnd algorithm that we're using requires normalization,
1668          * i.e. the msb of the denominator must be set.  Since we know that
1669          * DECOMPOSED_BINARY_POINT is msb-1, the inputs must be shifted left
1670          * by one (more), and the remainder must be shifted right by one.
1671          */
1672         if (a.frac < b.frac) {
1673             exp -= 1;
1674             shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 2, &n1, &n0);
1675         } else {
1676             shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, &n1, &n0);
1677         }
1678         q = udiv_qrnnd(&r, n1, n0, b.frac << 1);
1679 
1680         /*
1681          * Set lsb if there is a remainder, to set inexact.
1682          * As mentioned above, to find the actual value of the remainder we
1683          * would need to shift right, but (1) we are only concerned about
1684          * non-zero-ness, and (2) the remainder will always be even because
1685          * both inputs to the division primitive are even.
1686          */
1687         a.frac = q | (r != 0);
1688         a.sign = sign;
1689         a.exp = exp;
1690         return a;
1691     }
1692     /* handle all the NaN cases */
1693     if (is_nan(a.cls) || is_nan(b.cls)) {
1694         return pick_nan(a, b, s);
1695     }
1696     /* 0/0 or Inf/Inf */
1697     if (a.cls == b.cls
1698         &&
1699         (a.cls == float_class_inf || a.cls == float_class_zero)) {
1700         s->float_exception_flags |= float_flag_invalid;
1701         return parts_default_nan(s);
1702     }
1703     /* Inf / x or 0 / x */
1704     if (a.cls == float_class_inf || a.cls == float_class_zero) {
1705         a.sign = sign;
1706         return a;
1707     }
1708     /* Div 0 => Inf */
1709     if (b.cls == float_class_zero) {
1710         s->float_exception_flags |= float_flag_divbyzero;
1711         a.cls = float_class_inf;
1712         a.sign = sign;
1713         return a;
1714     }
1715     /* Div by Inf */
1716     if (b.cls == float_class_inf) {
1717         a.cls = float_class_zero;
1718         a.sign = sign;
1719         return a;
1720     }
1721     g_assert_not_reached();
1722 }
1723 
1724 float16 float16_div(float16 a, float16 b, float_status *status)
1725 {
1726     FloatParts pa = float16_unpack_canonical(a, status);
1727     FloatParts pb = float16_unpack_canonical(b, status);
1728     FloatParts pr = div_floats(pa, pb, status);
1729 
1730     return float16_round_pack_canonical(pr, status);
1731 }
1732 
1733 static float32 QEMU_SOFTFLOAT_ATTR
1734 soft_f32_div(float32 a, float32 b, float_status *status)
1735 {
1736     FloatParts pa = float32_unpack_canonical(a, status);
1737     FloatParts pb = float32_unpack_canonical(b, status);
1738     FloatParts pr = div_floats(pa, pb, status);
1739 
1740     return float32_round_pack_canonical(pr, status);
1741 }
1742 
1743 static float64 QEMU_SOFTFLOAT_ATTR
1744 soft_f64_div(float64 a, float64 b, float_status *status)
1745 {
1746     FloatParts pa = float64_unpack_canonical(a, status);
1747     FloatParts pb = float64_unpack_canonical(b, status);
1748     FloatParts pr = div_floats(pa, pb, status);
1749 
1750     return float64_round_pack_canonical(pr, status);
1751 }
1752 
1753 static float hard_f32_div(float a, float b)
1754 {
1755     return a / b;
1756 }
1757 
1758 static double hard_f64_div(double a, double b)
1759 {
1760     return a / b;
1761 }
1762 
1763 static bool f32_div_pre(union_float32 a, union_float32 b)
1764 {
1765     if (QEMU_HARDFLOAT_2F32_USE_FP) {
1766         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
1767                fpclassify(b.h) == FP_NORMAL;
1768     }
1769     return float32_is_zero_or_normal(a.s) && float32_is_normal(b.s);
1770 }
1771 
1772 static bool f64_div_pre(union_float64 a, union_float64 b)
1773 {
1774     if (QEMU_HARDFLOAT_2F64_USE_FP) {
1775         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
1776                fpclassify(b.h) == FP_NORMAL;
1777     }
1778     return float64_is_zero_or_normal(a.s) && float64_is_normal(b.s);
1779 }
1780 
1781 static bool f32_div_post(union_float32 a, union_float32 b)
1782 {
1783     if (QEMU_HARDFLOAT_2F32_USE_FP) {
1784         return fpclassify(a.h) != FP_ZERO;
1785     }
1786     return !float32_is_zero(a.s);
1787 }
1788 
1789 static bool f64_div_post(union_float64 a, union_float64 b)
1790 {
1791     if (QEMU_HARDFLOAT_2F64_USE_FP) {
1792         return fpclassify(a.h) != FP_ZERO;
1793     }
1794     return !float64_is_zero(a.s);
1795 }
1796 
1797 float32 QEMU_FLATTEN
1798 float32_div(float32 a, float32 b, float_status *s)
1799 {
1800     return float32_gen2(a, b, s, hard_f32_div, soft_f32_div,
1801                         f32_div_pre, f32_div_post);
1802 }
1803 
1804 float64 QEMU_FLATTEN
1805 float64_div(float64 a, float64 b, float_status *s)
1806 {
1807     return float64_gen2(a, b, s, hard_f64_div, soft_f64_div,
1808                         f64_div_pre, f64_div_post);
1809 }
1810 
1811 /*
1812  * Float to Float conversions
1813  *
1814  * Returns the result of converting one float format to another. The
1815  * conversion is performed according to the IEC/IEEE Standard for
1816  * Binary Floating-Point Arithmetic.
1817  *
1818  * The float_to_float helper only needs to take care of raising
1819  * invalid exceptions and handling the conversion on NaNs.
1820  */
1821 
1822 static FloatParts float_to_float(FloatParts a, const FloatFmt *dstf,
1823                                  float_status *s)
1824 {
1825     if (dstf->arm_althp) {
1826         switch (a.cls) {
1827         case float_class_qnan:
1828         case float_class_snan:
1829             /* There is no NaN in the destination format.  Raise Invalid
1830              * and return a zero with the sign of the input NaN.
1831              */
1832             s->float_exception_flags |= float_flag_invalid;
1833             a.cls = float_class_zero;
1834             a.frac = 0;
1835             a.exp = 0;
1836             break;
1837 
1838         case float_class_inf:
1839             /* There is no Inf in the destination format.  Raise Invalid
1840              * and return the maximum normal with the correct sign.
1841              */
1842             s->float_exception_flags |= float_flag_invalid;
1843             a.cls = float_class_normal;
1844             a.exp = dstf->exp_max;
1845             a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift;
1846             break;
1847 
1848         default:
1849             break;
1850         }
1851     } else if (is_nan(a.cls)) {
1852         if (is_snan(a.cls)) {
1853             s->float_exception_flags |= float_flag_invalid;
1854             a = parts_silence_nan(a, s);
1855         }
1856         if (s->default_nan_mode) {
1857             return parts_default_nan(s);
1858         }
1859     }
1860     return a;
1861 }
1862 
1863 float32 float16_to_float32(float16 a, bool ieee, float_status *s)
1864 {
1865     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1866     FloatParts p = float16a_unpack_canonical(a, s, fmt16);
1867     FloatParts pr = float_to_float(p, &float32_params, s);
1868     return float32_round_pack_canonical(pr, s);
1869 }
1870 
1871 float64 float16_to_float64(float16 a, bool ieee, float_status *s)
1872 {
1873     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1874     FloatParts p = float16a_unpack_canonical(a, s, fmt16);
1875     FloatParts pr = float_to_float(p, &float64_params, s);
1876     return float64_round_pack_canonical(pr, s);
1877 }
1878 
1879 float16 float32_to_float16(float32 a, bool ieee, float_status *s)
1880 {
1881     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1882     FloatParts p = float32_unpack_canonical(a, s);
1883     FloatParts pr = float_to_float(p, fmt16, s);
1884     return float16a_round_pack_canonical(pr, s, fmt16);
1885 }
1886 
1887 static float64 QEMU_SOFTFLOAT_ATTR
1888 soft_float32_to_float64(float32 a, float_status *s)
1889 {
1890     FloatParts p = float32_unpack_canonical(a, s);
1891     FloatParts pr = float_to_float(p, &float64_params, s);
1892     return float64_round_pack_canonical(pr, s);
1893 }
1894 
1895 float64 float32_to_float64(float32 a, float_status *s)
1896 {
1897     if (likely(float32_is_normal(a))) {
1898         /* Widening conversion can never produce inexact results.  */
1899         union_float32 uf;
1900         union_float64 ud;
1901         uf.s = a;
1902         ud.h = uf.h;
1903         return ud.s;
1904     } else if (float32_is_zero(a)) {
1905         return float64_set_sign(float64_zero, float32_is_neg(a));
1906     } else {
1907         return soft_float32_to_float64(a, s);
1908     }
1909 }
1910 
1911 float16 float64_to_float16(float64 a, bool ieee, float_status *s)
1912 {
1913     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1914     FloatParts p = float64_unpack_canonical(a, s);
1915     FloatParts pr = float_to_float(p, fmt16, s);
1916     return float16a_round_pack_canonical(pr, s, fmt16);
1917 }
1918 
1919 float32 float64_to_float32(float64 a, float_status *s)
1920 {
1921     FloatParts p = float64_unpack_canonical(a, s);
1922     FloatParts pr = float_to_float(p, &float32_params, s);
1923     return float32_round_pack_canonical(pr, s);
1924 }
1925 
1926 /*
1927  * Rounds the floating-point value `a' to an integer, and returns the
1928  * result as a floating-point value. The operation is performed
1929  * according to the IEC/IEEE Standard for Binary Floating-Point
1930  * Arithmetic.
1931  */
1932 
1933 static FloatParts round_to_int(FloatParts a, FloatRoundMode rmode,
1934                                int scale, float_status *s)
1935 {
1936     switch (a.cls) {
1937     case float_class_qnan:
1938     case float_class_snan:
1939         return return_nan(a, s);
1940 
1941     case float_class_zero:
1942     case float_class_inf:
1943         /* already "integral" */
1944         break;
1945 
1946     case float_class_normal:
1947         scale = MIN(MAX(scale, -0x10000), 0x10000);
1948         a.exp += scale;
1949 
1950         if (a.exp >= DECOMPOSED_BINARY_POINT) {
1951             /* already integral */
1952             break;
1953         }
1954         if (a.exp < 0) {
1955             bool one;
1956             /* all fractional */
1957             s->float_exception_flags |= float_flag_inexact;
1958             switch (rmode) {
1959             case float_round_nearest_even:
1960                 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
1961                 break;
1962             case float_round_ties_away:
1963                 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
1964                 break;
1965             case float_round_to_zero:
1966                 one = false;
1967                 break;
1968             case float_round_up:
1969                 one = !a.sign;
1970                 break;
1971             case float_round_down:
1972                 one = a.sign;
1973                 break;
1974             case float_round_to_odd:
1975                 one = true;
1976                 break;
1977             default:
1978                 g_assert_not_reached();
1979             }
1980 
1981             if (one) {
1982                 a.frac = DECOMPOSED_IMPLICIT_BIT;
1983                 a.exp = 0;
1984             } else {
1985                 a.cls = float_class_zero;
1986             }
1987         } else {
1988             uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
1989             uint64_t frac_lsbm1 = frac_lsb >> 1;
1990             uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
1991             uint64_t rnd_mask = rnd_even_mask >> 1;
1992             uint64_t inc;
1993 
1994             switch (rmode) {
1995             case float_round_nearest_even:
1996                 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
1997                 break;
1998             case float_round_ties_away:
1999                 inc = frac_lsbm1;
2000                 break;
2001             case float_round_to_zero:
2002                 inc = 0;
2003                 break;
2004             case float_round_up:
2005                 inc = a.sign ? 0 : rnd_mask;
2006                 break;
2007             case float_round_down:
2008                 inc = a.sign ? rnd_mask : 0;
2009                 break;
2010             case float_round_to_odd:
2011                 inc = a.frac & frac_lsb ? 0 : rnd_mask;
2012                 break;
2013             default:
2014                 g_assert_not_reached();
2015             }
2016 
2017             if (a.frac & rnd_mask) {
2018                 s->float_exception_flags |= float_flag_inexact;
2019                 a.frac += inc;
2020                 a.frac &= ~rnd_mask;
2021                 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
2022                     a.frac >>= 1;
2023                     a.exp++;
2024                 }
2025             }
2026         }
2027         break;
2028     default:
2029         g_assert_not_reached();
2030     }
2031     return a;
2032 }
2033 
2034 float16 float16_round_to_int(float16 a, float_status *s)
2035 {
2036     FloatParts pa = float16_unpack_canonical(a, s);
2037     FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2038     return float16_round_pack_canonical(pr, s);
2039 }
2040 
2041 float32 float32_round_to_int(float32 a, float_status *s)
2042 {
2043     FloatParts pa = float32_unpack_canonical(a, s);
2044     FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2045     return float32_round_pack_canonical(pr, s);
2046 }
2047 
2048 float64 float64_round_to_int(float64 a, float_status *s)
2049 {
2050     FloatParts pa = float64_unpack_canonical(a, s);
2051     FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2052     return float64_round_pack_canonical(pr, s);
2053 }
2054 
2055 /*
2056  * Returns the result of converting the floating-point value `a' to
2057  * the two's complement integer format. The conversion is performed
2058  * according to the IEC/IEEE Standard for Binary Floating-Point
2059  * Arithmetic---which means in particular that the conversion is
2060  * rounded according to the current rounding mode. If `a' is a NaN,
2061  * the largest positive integer is returned. Otherwise, if the
2062  * conversion overflows, the largest integer with the same sign as `a'
2063  * is returned.
2064 */
2065 
2066 static int64_t round_to_int_and_pack(FloatParts in, FloatRoundMode rmode,
2067                                      int scale, int64_t min, int64_t max,
2068                                      float_status *s)
2069 {
2070     uint64_t r;
2071     int orig_flags = get_float_exception_flags(s);
2072     FloatParts p = round_to_int(in, rmode, scale, s);
2073 
2074     switch (p.cls) {
2075     case float_class_snan:
2076     case float_class_qnan:
2077         s->float_exception_flags = orig_flags | float_flag_invalid;
2078         return max;
2079     case float_class_inf:
2080         s->float_exception_flags = orig_flags | float_flag_invalid;
2081         return p.sign ? min : max;
2082     case float_class_zero:
2083         return 0;
2084     case float_class_normal:
2085         if (p.exp < DECOMPOSED_BINARY_POINT) {
2086             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
2087         } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
2088             r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
2089         } else {
2090             r = UINT64_MAX;
2091         }
2092         if (p.sign) {
2093             if (r <= -(uint64_t) min) {
2094                 return -r;
2095             } else {
2096                 s->float_exception_flags = orig_flags | float_flag_invalid;
2097                 return min;
2098             }
2099         } else {
2100             if (r <= max) {
2101                 return r;
2102             } else {
2103                 s->float_exception_flags = orig_flags | float_flag_invalid;
2104                 return max;
2105             }
2106         }
2107     default:
2108         g_assert_not_reached();
2109     }
2110 }
2111 
2112 int16_t float16_to_int16_scalbn(float16 a, FloatRoundMode rmode, int scale,
2113                                 float_status *s)
2114 {
2115     return round_to_int_and_pack(float16_unpack_canonical(a, s),
2116                                  rmode, scale, INT16_MIN, INT16_MAX, s);
2117 }
2118 
2119 int32_t float16_to_int32_scalbn(float16 a, FloatRoundMode rmode, int scale,
2120                                 float_status *s)
2121 {
2122     return round_to_int_and_pack(float16_unpack_canonical(a, s),
2123                                  rmode, scale, INT32_MIN, INT32_MAX, s);
2124 }
2125 
2126 int64_t float16_to_int64_scalbn(float16 a, FloatRoundMode rmode, int scale,
2127                                 float_status *s)
2128 {
2129     return round_to_int_and_pack(float16_unpack_canonical(a, s),
2130                                  rmode, scale, INT64_MIN, INT64_MAX, s);
2131 }
2132 
2133 int16_t float32_to_int16_scalbn(float32 a, FloatRoundMode rmode, int scale,
2134                                 float_status *s)
2135 {
2136     return round_to_int_and_pack(float32_unpack_canonical(a, s),
2137                                  rmode, scale, INT16_MIN, INT16_MAX, s);
2138 }
2139 
2140 int32_t float32_to_int32_scalbn(float32 a, FloatRoundMode rmode, int scale,
2141                                 float_status *s)
2142 {
2143     return round_to_int_and_pack(float32_unpack_canonical(a, s),
2144                                  rmode, scale, INT32_MIN, INT32_MAX, s);
2145 }
2146 
2147 int64_t float32_to_int64_scalbn(float32 a, FloatRoundMode rmode, int scale,
2148                                 float_status *s)
2149 {
2150     return round_to_int_and_pack(float32_unpack_canonical(a, s),
2151                                  rmode, scale, INT64_MIN, INT64_MAX, s);
2152 }
2153 
2154 int16_t float64_to_int16_scalbn(float64 a, FloatRoundMode rmode, int scale,
2155                                 float_status *s)
2156 {
2157     return round_to_int_and_pack(float64_unpack_canonical(a, s),
2158                                  rmode, scale, INT16_MIN, INT16_MAX, s);
2159 }
2160 
2161 int32_t float64_to_int32_scalbn(float64 a, FloatRoundMode rmode, int scale,
2162                                 float_status *s)
2163 {
2164     return round_to_int_and_pack(float64_unpack_canonical(a, s),
2165                                  rmode, scale, INT32_MIN, INT32_MAX, s);
2166 }
2167 
2168 int64_t float64_to_int64_scalbn(float64 a, FloatRoundMode rmode, int scale,
2169                                 float_status *s)
2170 {
2171     return round_to_int_and_pack(float64_unpack_canonical(a, s),
2172                                  rmode, scale, INT64_MIN, INT64_MAX, s);
2173 }
2174 
2175 int16_t float16_to_int16(float16 a, float_status *s)
2176 {
2177     return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2178 }
2179 
2180 int32_t float16_to_int32(float16 a, float_status *s)
2181 {
2182     return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2183 }
2184 
2185 int64_t float16_to_int64(float16 a, float_status *s)
2186 {
2187     return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2188 }
2189 
2190 int16_t float32_to_int16(float32 a, float_status *s)
2191 {
2192     return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2193 }
2194 
2195 int32_t float32_to_int32(float32 a, float_status *s)
2196 {
2197     return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2198 }
2199 
2200 int64_t float32_to_int64(float32 a, float_status *s)
2201 {
2202     return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2203 }
2204 
2205 int16_t float64_to_int16(float64 a, float_status *s)
2206 {
2207     return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2208 }
2209 
2210 int32_t float64_to_int32(float64 a, float_status *s)
2211 {
2212     return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2213 }
2214 
2215 int64_t float64_to_int64(float64 a, float_status *s)
2216 {
2217     return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2218 }
2219 
2220 int16_t float16_to_int16_round_to_zero(float16 a, float_status *s)
2221 {
2222     return float16_to_int16_scalbn(a, float_round_to_zero, 0, s);
2223 }
2224 
2225 int32_t float16_to_int32_round_to_zero(float16 a, float_status *s)
2226 {
2227     return float16_to_int32_scalbn(a, float_round_to_zero, 0, s);
2228 }
2229 
2230 int64_t float16_to_int64_round_to_zero(float16 a, float_status *s)
2231 {
2232     return float16_to_int64_scalbn(a, float_round_to_zero, 0, s);
2233 }
2234 
2235 int16_t float32_to_int16_round_to_zero(float32 a, float_status *s)
2236 {
2237     return float32_to_int16_scalbn(a, float_round_to_zero, 0, s);
2238 }
2239 
2240 int32_t float32_to_int32_round_to_zero(float32 a, float_status *s)
2241 {
2242     return float32_to_int32_scalbn(a, float_round_to_zero, 0, s);
2243 }
2244 
2245 int64_t float32_to_int64_round_to_zero(float32 a, float_status *s)
2246 {
2247     return float32_to_int64_scalbn(a, float_round_to_zero, 0, s);
2248 }
2249 
2250 int16_t float64_to_int16_round_to_zero(float64 a, float_status *s)
2251 {
2252     return float64_to_int16_scalbn(a, float_round_to_zero, 0, s);
2253 }
2254 
2255 int32_t float64_to_int32_round_to_zero(float64 a, float_status *s)
2256 {
2257     return float64_to_int32_scalbn(a, float_round_to_zero, 0, s);
2258 }
2259 
2260 int64_t float64_to_int64_round_to_zero(float64 a, float_status *s)
2261 {
2262     return float64_to_int64_scalbn(a, float_round_to_zero, 0, s);
2263 }
2264 
2265 /*
2266  *  Returns the result of converting the floating-point value `a' to
2267  *  the unsigned integer format. The conversion is performed according
2268  *  to the IEC/IEEE Standard for Binary Floating-Point
2269  *  Arithmetic---which means in particular that the conversion is
2270  *  rounded according to the current rounding mode. If `a' is a NaN,
2271  *  the largest unsigned integer is returned. Otherwise, if the
2272  *  conversion overflows, the largest unsigned integer is returned. If
2273  *  the 'a' is negative, the result is rounded and zero is returned;
2274  *  values that do not round to zero will raise the inexact exception
2275  *  flag.
2276  */
2277 
2278 static uint64_t round_to_uint_and_pack(FloatParts in, FloatRoundMode rmode,
2279                                        int scale, uint64_t max,
2280                                        float_status *s)
2281 {
2282     int orig_flags = get_float_exception_flags(s);
2283     FloatParts p = round_to_int(in, rmode, scale, s);
2284     uint64_t r;
2285 
2286     switch (p.cls) {
2287     case float_class_snan:
2288     case float_class_qnan:
2289         s->float_exception_flags = orig_flags | float_flag_invalid;
2290         return max;
2291     case float_class_inf:
2292         s->float_exception_flags = orig_flags | float_flag_invalid;
2293         return p.sign ? 0 : max;
2294     case float_class_zero:
2295         return 0;
2296     case float_class_normal:
2297         if (p.sign) {
2298             s->float_exception_flags = orig_flags | float_flag_invalid;
2299             return 0;
2300         }
2301 
2302         if (p.exp < DECOMPOSED_BINARY_POINT) {
2303             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
2304         } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
2305             r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
2306         } else {
2307             s->float_exception_flags = orig_flags | float_flag_invalid;
2308             return max;
2309         }
2310 
2311         /* For uint64 this will never trip, but if p.exp is too large
2312          * to shift a decomposed fraction we shall have exited via the
2313          * 3rd leg above.
2314          */
2315         if (r > max) {
2316             s->float_exception_flags = orig_flags | float_flag_invalid;
2317             return max;
2318         }
2319         return r;
2320     default:
2321         g_assert_not_reached();
2322     }
2323 }
2324 
2325 uint16_t float16_to_uint16_scalbn(float16 a, FloatRoundMode rmode, int scale,
2326                                   float_status *s)
2327 {
2328     return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2329                                   rmode, scale, UINT16_MAX, s);
2330 }
2331 
2332 uint32_t float16_to_uint32_scalbn(float16 a, FloatRoundMode rmode, int scale,
2333                                   float_status *s)
2334 {
2335     return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2336                                   rmode, scale, UINT32_MAX, s);
2337 }
2338 
2339 uint64_t float16_to_uint64_scalbn(float16 a, FloatRoundMode rmode, int scale,
2340                                   float_status *s)
2341 {
2342     return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2343                                   rmode, scale, UINT64_MAX, s);
2344 }
2345 
2346 uint16_t float32_to_uint16_scalbn(float32 a, FloatRoundMode rmode, int scale,
2347                                   float_status *s)
2348 {
2349     return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2350                                   rmode, scale, UINT16_MAX, s);
2351 }
2352 
2353 uint32_t float32_to_uint32_scalbn(float32 a, FloatRoundMode rmode, int scale,
2354                                   float_status *s)
2355 {
2356     return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2357                                   rmode, scale, UINT32_MAX, s);
2358 }
2359 
2360 uint64_t float32_to_uint64_scalbn(float32 a, FloatRoundMode rmode, int scale,
2361                                   float_status *s)
2362 {
2363     return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2364                                   rmode, scale, UINT64_MAX, s);
2365 }
2366 
2367 uint16_t float64_to_uint16_scalbn(float64 a, FloatRoundMode rmode, int scale,
2368                                   float_status *s)
2369 {
2370     return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2371                                   rmode, scale, UINT16_MAX, s);
2372 }
2373 
2374 uint32_t float64_to_uint32_scalbn(float64 a, FloatRoundMode rmode, int scale,
2375                                   float_status *s)
2376 {
2377     return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2378                                   rmode, scale, UINT32_MAX, s);
2379 }
2380 
2381 uint64_t float64_to_uint64_scalbn(float64 a, FloatRoundMode rmode, int scale,
2382                                   float_status *s)
2383 {
2384     return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2385                                   rmode, scale, UINT64_MAX, s);
2386 }
2387 
2388 uint16_t float16_to_uint16(float16 a, float_status *s)
2389 {
2390     return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2391 }
2392 
2393 uint32_t float16_to_uint32(float16 a, float_status *s)
2394 {
2395     return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2396 }
2397 
2398 uint64_t float16_to_uint64(float16 a, float_status *s)
2399 {
2400     return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2401 }
2402 
2403 uint16_t float32_to_uint16(float32 a, float_status *s)
2404 {
2405     return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2406 }
2407 
2408 uint32_t float32_to_uint32(float32 a, float_status *s)
2409 {
2410     return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2411 }
2412 
2413 uint64_t float32_to_uint64(float32 a, float_status *s)
2414 {
2415     return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2416 }
2417 
2418 uint16_t float64_to_uint16(float64 a, float_status *s)
2419 {
2420     return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2421 }
2422 
2423 uint32_t float64_to_uint32(float64 a, float_status *s)
2424 {
2425     return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2426 }
2427 
2428 uint64_t float64_to_uint64(float64 a, float_status *s)
2429 {
2430     return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2431 }
2432 
2433 uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s)
2434 {
2435     return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2436 }
2437 
2438 uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s)
2439 {
2440     return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2441 }
2442 
2443 uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s)
2444 {
2445     return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2446 }
2447 
2448 uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *s)
2449 {
2450     return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2451 }
2452 
2453 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s)
2454 {
2455     return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2456 }
2457 
2458 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s)
2459 {
2460     return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2461 }
2462 
2463 uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s)
2464 {
2465     return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2466 }
2467 
2468 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s)
2469 {
2470     return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2471 }
2472 
2473 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s)
2474 {
2475     return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2476 }
2477 
2478 /*
2479  * Integer to float conversions
2480  *
2481  * Returns the result of converting the two's complement integer `a'
2482  * to the floating-point format. The conversion is performed according
2483  * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2484  */
2485 
2486 static FloatParts int_to_float(int64_t a, int scale, float_status *status)
2487 {
2488     FloatParts r = { .sign = false };
2489 
2490     if (a == 0) {
2491         r.cls = float_class_zero;
2492     } else {
2493         uint64_t f = a;
2494         int shift;
2495 
2496         r.cls = float_class_normal;
2497         if (a < 0) {
2498             f = -f;
2499             r.sign = true;
2500         }
2501         shift = clz64(f) - 1;
2502         scale = MIN(MAX(scale, -0x10000), 0x10000);
2503 
2504         r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
2505         r.frac = (shift < 0 ? DECOMPOSED_IMPLICIT_BIT : f << shift);
2506     }
2507 
2508     return r;
2509 }
2510 
2511 float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status)
2512 {
2513     FloatParts pa = int_to_float(a, scale, status);
2514     return float16_round_pack_canonical(pa, status);
2515 }
2516 
2517 float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status)
2518 {
2519     return int64_to_float16_scalbn(a, scale, status);
2520 }
2521 
2522 float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status)
2523 {
2524     return int64_to_float16_scalbn(a, scale, status);
2525 }
2526 
2527 float16 int64_to_float16(int64_t a, float_status *status)
2528 {
2529     return int64_to_float16_scalbn(a, 0, status);
2530 }
2531 
2532 float16 int32_to_float16(int32_t a, float_status *status)
2533 {
2534     return int64_to_float16_scalbn(a, 0, status);
2535 }
2536 
2537 float16 int16_to_float16(int16_t a, float_status *status)
2538 {
2539     return int64_to_float16_scalbn(a, 0, status);
2540 }
2541 
2542 float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status)
2543 {
2544     FloatParts pa = int_to_float(a, scale, status);
2545     return float32_round_pack_canonical(pa, status);
2546 }
2547 
2548 float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status)
2549 {
2550     return int64_to_float32_scalbn(a, scale, status);
2551 }
2552 
2553 float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status)
2554 {
2555     return int64_to_float32_scalbn(a, scale, status);
2556 }
2557 
2558 float32 int64_to_float32(int64_t a, float_status *status)
2559 {
2560     return int64_to_float32_scalbn(a, 0, status);
2561 }
2562 
2563 float32 int32_to_float32(int32_t a, float_status *status)
2564 {
2565     return int64_to_float32_scalbn(a, 0, status);
2566 }
2567 
2568 float32 int16_to_float32(int16_t a, float_status *status)
2569 {
2570     return int64_to_float32_scalbn(a, 0, status);
2571 }
2572 
2573 float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status)
2574 {
2575     FloatParts pa = int_to_float(a, scale, status);
2576     return float64_round_pack_canonical(pa, status);
2577 }
2578 
2579 float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status)
2580 {
2581     return int64_to_float64_scalbn(a, scale, status);
2582 }
2583 
2584 float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status)
2585 {
2586     return int64_to_float64_scalbn(a, scale, status);
2587 }
2588 
2589 float64 int64_to_float64(int64_t a, float_status *status)
2590 {
2591     return int64_to_float64_scalbn(a, 0, status);
2592 }
2593 
2594 float64 int32_to_float64(int32_t a, float_status *status)
2595 {
2596     return int64_to_float64_scalbn(a, 0, status);
2597 }
2598 
2599 float64 int16_to_float64(int16_t a, float_status *status)
2600 {
2601     return int64_to_float64_scalbn(a, 0, status);
2602 }
2603 
2604 
2605 /*
2606  * Unsigned Integer to float conversions
2607  *
2608  * Returns the result of converting the unsigned integer `a' to the
2609  * floating-point format. The conversion is performed according to the
2610  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2611  */
2612 
2613 static FloatParts uint_to_float(uint64_t a, int scale, float_status *status)
2614 {
2615     FloatParts r = { .sign = false };
2616 
2617     if (a == 0) {
2618         r.cls = float_class_zero;
2619     } else {
2620         scale = MIN(MAX(scale, -0x10000), 0x10000);
2621         r.cls = float_class_normal;
2622         if ((int64_t)a < 0) {
2623             r.exp = DECOMPOSED_BINARY_POINT + 1 + scale;
2624             shift64RightJamming(a, 1, &a);
2625             r.frac = a;
2626         } else {
2627             int shift = clz64(a) - 1;
2628             r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
2629             r.frac = a << shift;
2630         }
2631     }
2632 
2633     return r;
2634 }
2635 
2636 float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status)
2637 {
2638     FloatParts pa = uint_to_float(a, scale, status);
2639     return float16_round_pack_canonical(pa, status);
2640 }
2641 
2642 float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status)
2643 {
2644     return uint64_to_float16_scalbn(a, scale, status);
2645 }
2646 
2647 float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status)
2648 {
2649     return uint64_to_float16_scalbn(a, scale, status);
2650 }
2651 
2652 float16 uint64_to_float16(uint64_t a, float_status *status)
2653 {
2654     return uint64_to_float16_scalbn(a, 0, status);
2655 }
2656 
2657 float16 uint32_to_float16(uint32_t a, float_status *status)
2658 {
2659     return uint64_to_float16_scalbn(a, 0, status);
2660 }
2661 
2662 float16 uint16_to_float16(uint16_t a, float_status *status)
2663 {
2664     return uint64_to_float16_scalbn(a, 0, status);
2665 }
2666 
2667 float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status)
2668 {
2669     FloatParts pa = uint_to_float(a, scale, status);
2670     return float32_round_pack_canonical(pa, status);
2671 }
2672 
2673 float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status)
2674 {
2675     return uint64_to_float32_scalbn(a, scale, status);
2676 }
2677 
2678 float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status)
2679 {
2680     return uint64_to_float32_scalbn(a, scale, status);
2681 }
2682 
2683 float32 uint64_to_float32(uint64_t a, float_status *status)
2684 {
2685     return uint64_to_float32_scalbn(a, 0, status);
2686 }
2687 
2688 float32 uint32_to_float32(uint32_t a, float_status *status)
2689 {
2690     return uint64_to_float32_scalbn(a, 0, status);
2691 }
2692 
2693 float32 uint16_to_float32(uint16_t a, float_status *status)
2694 {
2695     return uint64_to_float32_scalbn(a, 0, status);
2696 }
2697 
2698 float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status)
2699 {
2700     FloatParts pa = uint_to_float(a, scale, status);
2701     return float64_round_pack_canonical(pa, status);
2702 }
2703 
2704 float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status)
2705 {
2706     return uint64_to_float64_scalbn(a, scale, status);
2707 }
2708 
2709 float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status)
2710 {
2711     return uint64_to_float64_scalbn(a, scale, status);
2712 }
2713 
2714 float64 uint64_to_float64(uint64_t a, float_status *status)
2715 {
2716     return uint64_to_float64_scalbn(a, 0, status);
2717 }
2718 
2719 float64 uint32_to_float64(uint32_t a, float_status *status)
2720 {
2721     return uint64_to_float64_scalbn(a, 0, status);
2722 }
2723 
2724 float64 uint16_to_float64(uint16_t a, float_status *status)
2725 {
2726     return uint64_to_float64_scalbn(a, 0, status);
2727 }
2728 
2729 /* Float Min/Max */
2730 /* min() and max() functions. These can't be implemented as
2731  * 'compare and pick one input' because that would mishandle
2732  * NaNs and +0 vs -0.
2733  *
2734  * minnum() and maxnum() functions. These are similar to the min()
2735  * and max() functions but if one of the arguments is a QNaN and
2736  * the other is numerical then the numerical argument is returned.
2737  * SNaNs will get quietened before being returned.
2738  * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
2739  * and maxNum() operations. min() and max() are the typical min/max
2740  * semantics provided by many CPUs which predate that specification.
2741  *
2742  * minnummag() and maxnummag() functions correspond to minNumMag()
2743  * and minNumMag() from the IEEE-754 2008.
2744  */
2745 static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin,
2746                                 bool ieee, bool ismag, float_status *s)
2747 {
2748     if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
2749         if (ieee) {
2750             /* Takes two floating-point values `a' and `b', one of
2751              * which is a NaN, and returns the appropriate NaN
2752              * result. If either `a' or `b' is a signaling NaN,
2753              * the invalid exception is raised.
2754              */
2755             if (is_snan(a.cls) || is_snan(b.cls)) {
2756                 return pick_nan(a, b, s);
2757             } else if (is_nan(a.cls) && !is_nan(b.cls)) {
2758                 return b;
2759             } else if (is_nan(b.cls) && !is_nan(a.cls)) {
2760                 return a;
2761             }
2762         }
2763         return pick_nan(a, b, s);
2764     } else {
2765         int a_exp, b_exp;
2766 
2767         switch (a.cls) {
2768         case float_class_normal:
2769             a_exp = a.exp;
2770             break;
2771         case float_class_inf:
2772             a_exp = INT_MAX;
2773             break;
2774         case float_class_zero:
2775             a_exp = INT_MIN;
2776             break;
2777         default:
2778             g_assert_not_reached();
2779             break;
2780         }
2781         switch (b.cls) {
2782         case float_class_normal:
2783             b_exp = b.exp;
2784             break;
2785         case float_class_inf:
2786             b_exp = INT_MAX;
2787             break;
2788         case float_class_zero:
2789             b_exp = INT_MIN;
2790             break;
2791         default:
2792             g_assert_not_reached();
2793             break;
2794         }
2795 
2796         if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
2797             bool a_less = a_exp < b_exp;
2798             if (a_exp == b_exp) {
2799                 a_less = a.frac < b.frac;
2800             }
2801             return a_less ^ ismin ? b : a;
2802         }
2803 
2804         if (a.sign == b.sign) {
2805             bool a_less = a_exp < b_exp;
2806             if (a_exp == b_exp) {
2807                 a_less = a.frac < b.frac;
2808             }
2809             return a.sign ^ a_less ^ ismin ? b : a;
2810         } else {
2811             return a.sign ^ ismin ? b : a;
2812         }
2813     }
2814 }
2815 
2816 #define MINMAX(sz, name, ismin, isiee, ismag)                           \
2817 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b,      \
2818                                      float_status *s)                   \
2819 {                                                                       \
2820     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
2821     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
2822     FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);      \
2823                                                                         \
2824     return float ## sz ## _round_pack_canonical(pr, s);                 \
2825 }
2826 
2827 MINMAX(16, min, true, false, false)
2828 MINMAX(16, minnum, true, true, false)
2829 MINMAX(16, minnummag, true, true, true)
2830 MINMAX(16, max, false, false, false)
2831 MINMAX(16, maxnum, false, true, false)
2832 MINMAX(16, maxnummag, false, true, true)
2833 
2834 MINMAX(32, min, true, false, false)
2835 MINMAX(32, minnum, true, true, false)
2836 MINMAX(32, minnummag, true, true, true)
2837 MINMAX(32, max, false, false, false)
2838 MINMAX(32, maxnum, false, true, false)
2839 MINMAX(32, maxnummag, false, true, true)
2840 
2841 MINMAX(64, min, true, false, false)
2842 MINMAX(64, minnum, true, true, false)
2843 MINMAX(64, minnummag, true, true, true)
2844 MINMAX(64, max, false, false, false)
2845 MINMAX(64, maxnum, false, true, false)
2846 MINMAX(64, maxnummag, false, true, true)
2847 
2848 #undef MINMAX
2849 
2850 /* Floating point compare */
2851 static FloatRelation compare_floats(FloatParts a, FloatParts b, bool is_quiet,
2852                                     float_status *s)
2853 {
2854     if (is_nan(a.cls) || is_nan(b.cls)) {
2855         if (!is_quiet ||
2856             a.cls == float_class_snan ||
2857             b.cls == float_class_snan) {
2858             s->float_exception_flags |= float_flag_invalid;
2859         }
2860         return float_relation_unordered;
2861     }
2862 
2863     if (a.cls == float_class_zero) {
2864         if (b.cls == float_class_zero) {
2865             return float_relation_equal;
2866         }
2867         return b.sign ? float_relation_greater : float_relation_less;
2868     } else if (b.cls == float_class_zero) {
2869         return a.sign ? float_relation_less : float_relation_greater;
2870     }
2871 
2872     /* The only really important thing about infinity is its sign. If
2873      * both are infinities the sign marks the smallest of the two.
2874      */
2875     if (a.cls == float_class_inf) {
2876         if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
2877             return float_relation_equal;
2878         }
2879         return a.sign ? float_relation_less : float_relation_greater;
2880     } else if (b.cls == float_class_inf) {
2881         return b.sign ? float_relation_greater : float_relation_less;
2882     }
2883 
2884     if (a.sign != b.sign) {
2885         return a.sign ? float_relation_less : float_relation_greater;
2886     }
2887 
2888     if (a.exp == b.exp) {
2889         if (a.frac == b.frac) {
2890             return float_relation_equal;
2891         }
2892         if (a.sign) {
2893             return a.frac > b.frac ?
2894                 float_relation_less : float_relation_greater;
2895         } else {
2896             return a.frac > b.frac ?
2897                 float_relation_greater : float_relation_less;
2898         }
2899     } else {
2900         if (a.sign) {
2901             return a.exp > b.exp ? float_relation_less : float_relation_greater;
2902         } else {
2903             return a.exp > b.exp ? float_relation_greater : float_relation_less;
2904         }
2905     }
2906 }
2907 
2908 #define COMPARE(name, attr, sz)                                         \
2909 static int attr                                                         \
2910 name(float ## sz a, float ## sz b, bool is_quiet, float_status *s)      \
2911 {                                                                       \
2912     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
2913     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
2914     return compare_floats(pa, pb, is_quiet, s);                         \
2915 }
2916 
2917 COMPARE(soft_f16_compare, QEMU_FLATTEN, 16)
2918 COMPARE(soft_f32_compare, QEMU_SOFTFLOAT_ATTR, 32)
2919 COMPARE(soft_f64_compare, QEMU_SOFTFLOAT_ATTR, 64)
2920 
2921 #undef COMPARE
2922 
2923 FloatRelation float16_compare(float16 a, float16 b, float_status *s)
2924 {
2925     return soft_f16_compare(a, b, false, s);
2926 }
2927 
2928 FloatRelation float16_compare_quiet(float16 a, float16 b, float_status *s)
2929 {
2930     return soft_f16_compare(a, b, true, s);
2931 }
2932 
2933 static FloatRelation QEMU_FLATTEN
2934 f32_compare(float32 xa, float32 xb, bool is_quiet, float_status *s)
2935 {
2936     union_float32 ua, ub;
2937 
2938     ua.s = xa;
2939     ub.s = xb;
2940 
2941     if (QEMU_NO_HARDFLOAT) {
2942         goto soft;
2943     }
2944 
2945     float32_input_flush2(&ua.s, &ub.s, s);
2946     if (isgreaterequal(ua.h, ub.h)) {
2947         if (isgreater(ua.h, ub.h)) {
2948             return float_relation_greater;
2949         }
2950         return float_relation_equal;
2951     }
2952     if (likely(isless(ua.h, ub.h))) {
2953         return float_relation_less;
2954     }
2955     /* The only condition remaining is unordered.
2956      * Fall through to set flags.
2957      */
2958  soft:
2959     return soft_f32_compare(ua.s, ub.s, is_quiet, s);
2960 }
2961 
2962 FloatRelation float32_compare(float32 a, float32 b, float_status *s)
2963 {
2964     return f32_compare(a, b, false, s);
2965 }
2966 
2967 FloatRelation float32_compare_quiet(float32 a, float32 b, float_status *s)
2968 {
2969     return f32_compare(a, b, true, s);
2970 }
2971 
2972 static FloatRelation QEMU_FLATTEN
2973 f64_compare(float64 xa, float64 xb, bool is_quiet, float_status *s)
2974 {
2975     union_float64 ua, ub;
2976 
2977     ua.s = xa;
2978     ub.s = xb;
2979 
2980     if (QEMU_NO_HARDFLOAT) {
2981         goto soft;
2982     }
2983 
2984     float64_input_flush2(&ua.s, &ub.s, s);
2985     if (isgreaterequal(ua.h, ub.h)) {
2986         if (isgreater(ua.h, ub.h)) {
2987             return float_relation_greater;
2988         }
2989         return float_relation_equal;
2990     }
2991     if (likely(isless(ua.h, ub.h))) {
2992         return float_relation_less;
2993     }
2994     /* The only condition remaining is unordered.
2995      * Fall through to set flags.
2996      */
2997  soft:
2998     return soft_f64_compare(ua.s, ub.s, is_quiet, s);
2999 }
3000 
3001 FloatRelation float64_compare(float64 a, float64 b, float_status *s)
3002 {
3003     return f64_compare(a, b, false, s);
3004 }
3005 
3006 FloatRelation float64_compare_quiet(float64 a, float64 b, float_status *s)
3007 {
3008     return f64_compare(a, b, true, s);
3009 }
3010 
3011 /* Multiply A by 2 raised to the power N.  */
3012 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s)
3013 {
3014     if (unlikely(is_nan(a.cls))) {
3015         return return_nan(a, s);
3016     }
3017     if (a.cls == float_class_normal) {
3018         /* The largest float type (even though not supported by FloatParts)
3019          * is float128, which has a 15 bit exponent.  Bounding N to 16 bits
3020          * still allows rounding to infinity, without allowing overflow
3021          * within the int32_t that backs FloatParts.exp.
3022          */
3023         n = MIN(MAX(n, -0x10000), 0x10000);
3024         a.exp += n;
3025     }
3026     return a;
3027 }
3028 
3029 float16 float16_scalbn(float16 a, int n, float_status *status)
3030 {
3031     FloatParts pa = float16_unpack_canonical(a, status);
3032     FloatParts pr = scalbn_decomposed(pa, n, status);
3033     return float16_round_pack_canonical(pr, status);
3034 }
3035 
3036 float32 float32_scalbn(float32 a, int n, float_status *status)
3037 {
3038     FloatParts pa = float32_unpack_canonical(a, status);
3039     FloatParts pr = scalbn_decomposed(pa, n, status);
3040     return float32_round_pack_canonical(pr, status);
3041 }
3042 
3043 float64 float64_scalbn(float64 a, int n, float_status *status)
3044 {
3045     FloatParts pa = float64_unpack_canonical(a, status);
3046     FloatParts pr = scalbn_decomposed(pa, n, status);
3047     return float64_round_pack_canonical(pr, status);
3048 }
3049 
3050 /*
3051  * Square Root
3052  *
3053  * The old softfloat code did an approximation step before zeroing in
3054  * on the final result. However for simpleness we just compute the
3055  * square root by iterating down from the implicit bit to enough extra
3056  * bits to ensure we get a correctly rounded result.
3057  *
3058  * This does mean however the calculation is slower than before,
3059  * especially for 64 bit floats.
3060  */
3061 
3062 static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p)
3063 {
3064     uint64_t a_frac, r_frac, s_frac;
3065     int bit, last_bit;
3066 
3067     if (is_nan(a.cls)) {
3068         return return_nan(a, s);
3069     }
3070     if (a.cls == float_class_zero) {
3071         return a;  /* sqrt(+-0) = +-0 */
3072     }
3073     if (a.sign) {
3074         s->float_exception_flags |= float_flag_invalid;
3075         return parts_default_nan(s);
3076     }
3077     if (a.cls == float_class_inf) {
3078         return a;  /* sqrt(+inf) = +inf */
3079     }
3080 
3081     assert(a.cls == float_class_normal);
3082 
3083     /* We need two overflow bits at the top. Adding room for that is a
3084      * right shift. If the exponent is odd, we can discard the low bit
3085      * by multiplying the fraction by 2; that's a left shift. Combine
3086      * those and we shift right if the exponent is even.
3087      */
3088     a_frac = a.frac;
3089     if (!(a.exp & 1)) {
3090         a_frac >>= 1;
3091     }
3092     a.exp >>= 1;
3093 
3094     /* Bit-by-bit computation of sqrt.  */
3095     r_frac = 0;
3096     s_frac = 0;
3097 
3098     /* Iterate from implicit bit down to the 3 extra bits to compute a
3099      * properly rounded result. Remember we've inserted one more bit
3100      * at the top, so these positions are one less.
3101      */
3102     bit = DECOMPOSED_BINARY_POINT - 1;
3103     last_bit = MAX(p->frac_shift - 4, 0);
3104     do {
3105         uint64_t q = 1ULL << bit;
3106         uint64_t t_frac = s_frac + q;
3107         if (t_frac <= a_frac) {
3108             s_frac = t_frac + q;
3109             a_frac -= t_frac;
3110             r_frac += q;
3111         }
3112         a_frac <<= 1;
3113     } while (--bit >= last_bit);
3114 
3115     /* Undo the right shift done above. If there is any remaining
3116      * fraction, the result is inexact. Set the sticky bit.
3117      */
3118     a.frac = (r_frac << 1) + (a_frac != 0);
3119 
3120     return a;
3121 }
3122 
3123 float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status)
3124 {
3125     FloatParts pa = float16_unpack_canonical(a, status);
3126     FloatParts pr = sqrt_float(pa, status, &float16_params);
3127     return float16_round_pack_canonical(pr, status);
3128 }
3129 
3130 static float32 QEMU_SOFTFLOAT_ATTR
3131 soft_f32_sqrt(float32 a, float_status *status)
3132 {
3133     FloatParts pa = float32_unpack_canonical(a, status);
3134     FloatParts pr = sqrt_float(pa, status, &float32_params);
3135     return float32_round_pack_canonical(pr, status);
3136 }
3137 
3138 static float64 QEMU_SOFTFLOAT_ATTR
3139 soft_f64_sqrt(float64 a, float_status *status)
3140 {
3141     FloatParts pa = float64_unpack_canonical(a, status);
3142     FloatParts pr = sqrt_float(pa, status, &float64_params);
3143     return float64_round_pack_canonical(pr, status);
3144 }
3145 
3146 float32 QEMU_FLATTEN float32_sqrt(float32 xa, float_status *s)
3147 {
3148     union_float32 ua, ur;
3149 
3150     ua.s = xa;
3151     if (unlikely(!can_use_fpu(s))) {
3152         goto soft;
3153     }
3154 
3155     float32_input_flush1(&ua.s, s);
3156     if (QEMU_HARDFLOAT_1F32_USE_FP) {
3157         if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
3158                        fpclassify(ua.h) == FP_ZERO) ||
3159                      signbit(ua.h))) {
3160             goto soft;
3161         }
3162     } else if (unlikely(!float32_is_zero_or_normal(ua.s) ||
3163                         float32_is_neg(ua.s))) {
3164         goto soft;
3165     }
3166     ur.h = sqrtf(ua.h);
3167     return ur.s;
3168 
3169  soft:
3170     return soft_f32_sqrt(ua.s, s);
3171 }
3172 
3173 float64 QEMU_FLATTEN float64_sqrt(float64 xa, float_status *s)
3174 {
3175     union_float64 ua, ur;
3176 
3177     ua.s = xa;
3178     if (unlikely(!can_use_fpu(s))) {
3179         goto soft;
3180     }
3181 
3182     float64_input_flush1(&ua.s, s);
3183     if (QEMU_HARDFLOAT_1F64_USE_FP) {
3184         if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
3185                        fpclassify(ua.h) == FP_ZERO) ||
3186                      signbit(ua.h))) {
3187             goto soft;
3188         }
3189     } else if (unlikely(!float64_is_zero_or_normal(ua.s) ||
3190                         float64_is_neg(ua.s))) {
3191         goto soft;
3192     }
3193     ur.h = sqrt(ua.h);
3194     return ur.s;
3195 
3196  soft:
3197     return soft_f64_sqrt(ua.s, s);
3198 }
3199 
3200 /*----------------------------------------------------------------------------
3201 | The pattern for a default generated NaN.
3202 *----------------------------------------------------------------------------*/
3203 
3204 float16 float16_default_nan(float_status *status)
3205 {
3206     FloatParts p = parts_default_nan(status);
3207     p.frac >>= float16_params.frac_shift;
3208     return float16_pack_raw(p);
3209 }
3210 
3211 float32 float32_default_nan(float_status *status)
3212 {
3213     FloatParts p = parts_default_nan(status);
3214     p.frac >>= float32_params.frac_shift;
3215     return float32_pack_raw(p);
3216 }
3217 
3218 float64 float64_default_nan(float_status *status)
3219 {
3220     FloatParts p = parts_default_nan(status);
3221     p.frac >>= float64_params.frac_shift;
3222     return float64_pack_raw(p);
3223 }
3224 
3225 float128 float128_default_nan(float_status *status)
3226 {
3227     FloatParts p = parts_default_nan(status);
3228     float128 r;
3229 
3230     /* Extrapolate from the choices made by parts_default_nan to fill
3231      * in the quad-floating format.  If the low bit is set, assume we
3232      * want to set all non-snan bits.
3233      */
3234     r.low = -(p.frac & 1);
3235     r.high = p.frac >> (DECOMPOSED_BINARY_POINT - 48);
3236     r.high |= UINT64_C(0x7FFF000000000000);
3237     r.high |= (uint64_t)p.sign << 63;
3238 
3239     return r;
3240 }
3241 
3242 /*----------------------------------------------------------------------------
3243 | Returns a quiet NaN from a signalling NaN for the floating point value `a'.
3244 *----------------------------------------------------------------------------*/
3245 
3246 float16 float16_silence_nan(float16 a, float_status *status)
3247 {
3248     FloatParts p = float16_unpack_raw(a);
3249     p.frac <<= float16_params.frac_shift;
3250     p = parts_silence_nan(p, status);
3251     p.frac >>= float16_params.frac_shift;
3252     return float16_pack_raw(p);
3253 }
3254 
3255 float32 float32_silence_nan(float32 a, float_status *status)
3256 {
3257     FloatParts p = float32_unpack_raw(a);
3258     p.frac <<= float32_params.frac_shift;
3259     p = parts_silence_nan(p, status);
3260     p.frac >>= float32_params.frac_shift;
3261     return float32_pack_raw(p);
3262 }
3263 
3264 float64 float64_silence_nan(float64 a, float_status *status)
3265 {
3266     FloatParts p = float64_unpack_raw(a);
3267     p.frac <<= float64_params.frac_shift;
3268     p = parts_silence_nan(p, status);
3269     p.frac >>= float64_params.frac_shift;
3270     return float64_pack_raw(p);
3271 }
3272 
3273 
3274 /*----------------------------------------------------------------------------
3275 | If `a' is denormal and we are in flush-to-zero mode then set the
3276 | input-denormal exception and return zero. Otherwise just return the value.
3277 *----------------------------------------------------------------------------*/
3278 
3279 static bool parts_squash_denormal(FloatParts p, float_status *status)
3280 {
3281     if (p.exp == 0 && p.frac != 0) {
3282         float_raise(float_flag_input_denormal, status);
3283         return true;
3284     }
3285 
3286     return false;
3287 }
3288 
3289 float16 float16_squash_input_denormal(float16 a, float_status *status)
3290 {
3291     if (status->flush_inputs_to_zero) {
3292         FloatParts p = float16_unpack_raw(a);
3293         if (parts_squash_denormal(p, status)) {
3294             return float16_set_sign(float16_zero, p.sign);
3295         }
3296     }
3297     return a;
3298 }
3299 
3300 float32 float32_squash_input_denormal(float32 a, float_status *status)
3301 {
3302     if (status->flush_inputs_to_zero) {
3303         FloatParts p = float32_unpack_raw(a);
3304         if (parts_squash_denormal(p, status)) {
3305             return float32_set_sign(float32_zero, p.sign);
3306         }
3307     }
3308     return a;
3309 }
3310 
3311 float64 float64_squash_input_denormal(float64 a, float_status *status)
3312 {
3313     if (status->flush_inputs_to_zero) {
3314         FloatParts p = float64_unpack_raw(a);
3315         if (parts_squash_denormal(p, status)) {
3316             return float64_set_sign(float64_zero, p.sign);
3317         }
3318     }
3319     return a;
3320 }
3321 
3322 /*----------------------------------------------------------------------------
3323 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
3324 | and 7, and returns the properly rounded 32-bit integer corresponding to the
3325 | input.  If `zSign' is 1, the input is negated before being converted to an
3326 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
3327 | is simply rounded to an integer, with the inexact exception raised if the
3328 | input cannot be represented exactly as an integer.  However, if the fixed-
3329 | point input is too large, the invalid exception is raised and the largest
3330 | positive or negative integer is returned.
3331 *----------------------------------------------------------------------------*/
3332 
3333 static int32_t roundAndPackInt32(bool zSign, uint64_t absZ,
3334                                  float_status *status)
3335 {
3336     int8_t roundingMode;
3337     bool roundNearestEven;
3338     int8_t roundIncrement, roundBits;
3339     int32_t z;
3340 
3341     roundingMode = status->float_rounding_mode;
3342     roundNearestEven = ( roundingMode == float_round_nearest_even );
3343     switch (roundingMode) {
3344     case float_round_nearest_even:
3345     case float_round_ties_away:
3346         roundIncrement = 0x40;
3347         break;
3348     case float_round_to_zero:
3349         roundIncrement = 0;
3350         break;
3351     case float_round_up:
3352         roundIncrement = zSign ? 0 : 0x7f;
3353         break;
3354     case float_round_down:
3355         roundIncrement = zSign ? 0x7f : 0;
3356         break;
3357     case float_round_to_odd:
3358         roundIncrement = absZ & 0x80 ? 0 : 0x7f;
3359         break;
3360     default:
3361         abort();
3362     }
3363     roundBits = absZ & 0x7F;
3364     absZ = ( absZ + roundIncrement )>>7;
3365     if (!(roundBits ^ 0x40) && roundNearestEven) {
3366         absZ &= ~1;
3367     }
3368     z = absZ;
3369     if ( zSign ) z = - z;
3370     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
3371         float_raise(float_flag_invalid, status);
3372         return zSign ? INT32_MIN : INT32_MAX;
3373     }
3374     if (roundBits) {
3375         status->float_exception_flags |= float_flag_inexact;
3376     }
3377     return z;
3378 
3379 }
3380 
3381 /*----------------------------------------------------------------------------
3382 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3383 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3384 | and returns the properly rounded 64-bit integer corresponding to the input.
3385 | If `zSign' is 1, the input is negated before being converted to an integer.
3386 | Ordinarily, the fixed-point input is simply rounded to an integer, with
3387 | the inexact exception raised if the input cannot be represented exactly as
3388 | an integer.  However, if the fixed-point input is too large, the invalid
3389 | exception is raised and the largest positive or negative integer is
3390 | returned.
3391 *----------------------------------------------------------------------------*/
3392 
3393 static int64_t roundAndPackInt64(bool zSign, uint64_t absZ0, uint64_t absZ1,
3394                                float_status *status)
3395 {
3396     int8_t roundingMode;
3397     bool roundNearestEven, increment;
3398     int64_t z;
3399 
3400     roundingMode = status->float_rounding_mode;
3401     roundNearestEven = ( roundingMode == float_round_nearest_even );
3402     switch (roundingMode) {
3403     case float_round_nearest_even:
3404     case float_round_ties_away:
3405         increment = ((int64_t) absZ1 < 0);
3406         break;
3407     case float_round_to_zero:
3408         increment = 0;
3409         break;
3410     case float_round_up:
3411         increment = !zSign && absZ1;
3412         break;
3413     case float_round_down:
3414         increment = zSign && absZ1;
3415         break;
3416     case float_round_to_odd:
3417         increment = !(absZ0 & 1) && absZ1;
3418         break;
3419     default:
3420         abort();
3421     }
3422     if ( increment ) {
3423         ++absZ0;
3424         if ( absZ0 == 0 ) goto overflow;
3425         if (!(absZ1 << 1) && roundNearestEven) {
3426             absZ0 &= ~1;
3427         }
3428     }
3429     z = absZ0;
3430     if ( zSign ) z = - z;
3431     if ( z && ( ( z < 0 ) ^ zSign ) ) {
3432  overflow:
3433         float_raise(float_flag_invalid, status);
3434         return zSign ? INT64_MIN : INT64_MAX;
3435     }
3436     if (absZ1) {
3437         status->float_exception_flags |= float_flag_inexact;
3438     }
3439     return z;
3440 
3441 }
3442 
3443 /*----------------------------------------------------------------------------
3444 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3445 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3446 | and returns the properly rounded 64-bit unsigned integer corresponding to the
3447 | input.  Ordinarily, the fixed-point input is simply rounded to an integer,
3448 | with the inexact exception raised if the input cannot be represented exactly
3449 | as an integer.  However, if the fixed-point input is too large, the invalid
3450 | exception is raised and the largest unsigned integer is returned.
3451 *----------------------------------------------------------------------------*/
3452 
3453 static int64_t roundAndPackUint64(bool zSign, uint64_t absZ0,
3454                                 uint64_t absZ1, float_status *status)
3455 {
3456     int8_t roundingMode;
3457     bool roundNearestEven, increment;
3458 
3459     roundingMode = status->float_rounding_mode;
3460     roundNearestEven = (roundingMode == float_round_nearest_even);
3461     switch (roundingMode) {
3462     case float_round_nearest_even:
3463     case float_round_ties_away:
3464         increment = ((int64_t)absZ1 < 0);
3465         break;
3466     case float_round_to_zero:
3467         increment = 0;
3468         break;
3469     case float_round_up:
3470         increment = !zSign && absZ1;
3471         break;
3472     case float_round_down:
3473         increment = zSign && absZ1;
3474         break;
3475     case float_round_to_odd:
3476         increment = !(absZ0 & 1) && absZ1;
3477         break;
3478     default:
3479         abort();
3480     }
3481     if (increment) {
3482         ++absZ0;
3483         if (absZ0 == 0) {
3484             float_raise(float_flag_invalid, status);
3485             return UINT64_MAX;
3486         }
3487         if (!(absZ1 << 1) && roundNearestEven) {
3488             absZ0 &= ~1;
3489         }
3490     }
3491 
3492     if (zSign && absZ0) {
3493         float_raise(float_flag_invalid, status);
3494         return 0;
3495     }
3496 
3497     if (absZ1) {
3498         status->float_exception_flags |= float_flag_inexact;
3499     }
3500     return absZ0;
3501 }
3502 
3503 /*----------------------------------------------------------------------------
3504 | Normalizes the subnormal single-precision floating-point value represented
3505 | by the denormalized significand `aSig'.  The normalized exponent and
3506 | significand are stored at the locations pointed to by `zExpPtr' and
3507 | `zSigPtr', respectively.
3508 *----------------------------------------------------------------------------*/
3509 
3510 static void
3511  normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
3512 {
3513     int8_t shiftCount;
3514 
3515     shiftCount = clz32(aSig) - 8;
3516     *zSigPtr = aSig<<shiftCount;
3517     *zExpPtr = 1 - shiftCount;
3518 
3519 }
3520 
3521 /*----------------------------------------------------------------------------
3522 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3523 | and significand `zSig', and returns the proper single-precision floating-
3524 | point value corresponding to the abstract input.  Ordinarily, the abstract
3525 | value is simply rounded and packed into the single-precision format, with
3526 | the inexact exception raised if the abstract input cannot be represented
3527 | exactly.  However, if the abstract value is too large, the overflow and
3528 | inexact exceptions are raised and an infinity or maximal finite value is
3529 | returned.  If the abstract value is too small, the input value is rounded to
3530 | a subnormal number, and the underflow and inexact exceptions are raised if
3531 | the abstract input cannot be represented exactly as a subnormal single-
3532 | precision floating-point number.
3533 |     The input significand `zSig' has its binary point between bits 30
3534 | and 29, which is 7 bits to the left of the usual location.  This shifted
3535 | significand must be normalized or smaller.  If `zSig' is not normalized,
3536 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3537 | and it must not require rounding.  In the usual case that `zSig' is
3538 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3539 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3540 | Binary Floating-Point Arithmetic.
3541 *----------------------------------------------------------------------------*/
3542 
3543 static float32 roundAndPackFloat32(bool zSign, int zExp, uint32_t zSig,
3544                                    float_status *status)
3545 {
3546     int8_t roundingMode;
3547     bool roundNearestEven;
3548     int8_t roundIncrement, roundBits;
3549     bool isTiny;
3550 
3551     roundingMode = status->float_rounding_mode;
3552     roundNearestEven = ( roundingMode == float_round_nearest_even );
3553     switch (roundingMode) {
3554     case float_round_nearest_even:
3555     case float_round_ties_away:
3556         roundIncrement = 0x40;
3557         break;
3558     case float_round_to_zero:
3559         roundIncrement = 0;
3560         break;
3561     case float_round_up:
3562         roundIncrement = zSign ? 0 : 0x7f;
3563         break;
3564     case float_round_down:
3565         roundIncrement = zSign ? 0x7f : 0;
3566         break;
3567     case float_round_to_odd:
3568         roundIncrement = zSig & 0x80 ? 0 : 0x7f;
3569         break;
3570     default:
3571         abort();
3572         break;
3573     }
3574     roundBits = zSig & 0x7F;
3575     if ( 0xFD <= (uint16_t) zExp ) {
3576         if (    ( 0xFD < zExp )
3577              || (    ( zExp == 0xFD )
3578                   && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
3579            ) {
3580             bool overflow_to_inf = roundingMode != float_round_to_odd &&
3581                                    roundIncrement != 0;
3582             float_raise(float_flag_overflow | float_flag_inexact, status);
3583             return packFloat32(zSign, 0xFF, -!overflow_to_inf);
3584         }
3585         if ( zExp < 0 ) {
3586             if (status->flush_to_zero) {
3587                 float_raise(float_flag_output_denormal, status);
3588                 return packFloat32(zSign, 0, 0);
3589             }
3590             isTiny = status->tininess_before_rounding
3591                   || (zExp < -1)
3592                   || (zSig + roundIncrement < 0x80000000);
3593             shift32RightJamming( zSig, - zExp, &zSig );
3594             zExp = 0;
3595             roundBits = zSig & 0x7F;
3596             if (isTiny && roundBits) {
3597                 float_raise(float_flag_underflow, status);
3598             }
3599             if (roundingMode == float_round_to_odd) {
3600                 /*
3601                  * For round-to-odd case, the roundIncrement depends on
3602                  * zSig which just changed.
3603                  */
3604                 roundIncrement = zSig & 0x80 ? 0 : 0x7f;
3605             }
3606         }
3607     }
3608     if (roundBits) {
3609         status->float_exception_flags |= float_flag_inexact;
3610     }
3611     zSig = ( zSig + roundIncrement )>>7;
3612     if (!(roundBits ^ 0x40) && roundNearestEven) {
3613         zSig &= ~1;
3614     }
3615     if ( zSig == 0 ) zExp = 0;
3616     return packFloat32( zSign, zExp, zSig );
3617 
3618 }
3619 
3620 /*----------------------------------------------------------------------------
3621 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3622 | and significand `zSig', and returns the proper single-precision floating-
3623 | point value corresponding to the abstract input.  This routine is just like
3624 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
3625 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
3626 | floating-point exponent.
3627 *----------------------------------------------------------------------------*/
3628 
3629 static float32
3630  normalizeRoundAndPackFloat32(bool zSign, int zExp, uint32_t zSig,
3631                               float_status *status)
3632 {
3633     int8_t shiftCount;
3634 
3635     shiftCount = clz32(zSig) - 1;
3636     return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
3637                                status);
3638 
3639 }
3640 
3641 /*----------------------------------------------------------------------------
3642 | Normalizes the subnormal double-precision floating-point value represented
3643 | by the denormalized significand `aSig'.  The normalized exponent and
3644 | significand are stored at the locations pointed to by `zExpPtr' and
3645 | `zSigPtr', respectively.
3646 *----------------------------------------------------------------------------*/
3647 
3648 static void
3649  normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
3650 {
3651     int8_t shiftCount;
3652 
3653     shiftCount = clz64(aSig) - 11;
3654     *zSigPtr = aSig<<shiftCount;
3655     *zExpPtr = 1 - shiftCount;
3656 
3657 }
3658 
3659 /*----------------------------------------------------------------------------
3660 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3661 | double-precision floating-point value, returning the result.  After being
3662 | shifted into the proper positions, the three fields are simply added
3663 | together to form the result.  This means that any integer portion of `zSig'
3664 | will be added into the exponent.  Since a properly normalized significand
3665 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3666 | than the desired result exponent whenever `zSig' is a complete, normalized
3667 | significand.
3668 *----------------------------------------------------------------------------*/
3669 
3670 static inline float64 packFloat64(bool zSign, int zExp, uint64_t zSig)
3671 {
3672 
3673     return make_float64(
3674         ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
3675 
3676 }
3677 
3678 /*----------------------------------------------------------------------------
3679 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3680 | and significand `zSig', and returns the proper double-precision floating-
3681 | point value corresponding to the abstract input.  Ordinarily, the abstract
3682 | value is simply rounded and packed into the double-precision format, with
3683 | the inexact exception raised if the abstract input cannot be represented
3684 | exactly.  However, if the abstract value is too large, the overflow and
3685 | inexact exceptions are raised and an infinity or maximal finite value is
3686 | returned.  If the abstract value is too small, the input value is rounded to
3687 | a subnormal number, and the underflow and inexact exceptions are raised if
3688 | the abstract input cannot be represented exactly as a subnormal double-
3689 | precision floating-point number.
3690 |     The input significand `zSig' has its binary point between bits 62
3691 | and 61, which is 10 bits to the left of the usual location.  This shifted
3692 | significand must be normalized or smaller.  If `zSig' is not normalized,
3693 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3694 | and it must not require rounding.  In the usual case that `zSig' is
3695 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3696 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3697 | Binary Floating-Point Arithmetic.
3698 *----------------------------------------------------------------------------*/
3699 
3700 static float64 roundAndPackFloat64(bool zSign, int zExp, uint64_t zSig,
3701                                    float_status *status)
3702 {
3703     int8_t roundingMode;
3704     bool roundNearestEven;
3705     int roundIncrement, roundBits;
3706     bool isTiny;
3707 
3708     roundingMode = status->float_rounding_mode;
3709     roundNearestEven = ( roundingMode == float_round_nearest_even );
3710     switch (roundingMode) {
3711     case float_round_nearest_even:
3712     case float_round_ties_away:
3713         roundIncrement = 0x200;
3714         break;
3715     case float_round_to_zero:
3716         roundIncrement = 0;
3717         break;
3718     case float_round_up:
3719         roundIncrement = zSign ? 0 : 0x3ff;
3720         break;
3721     case float_round_down:
3722         roundIncrement = zSign ? 0x3ff : 0;
3723         break;
3724     case float_round_to_odd:
3725         roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
3726         break;
3727     default:
3728         abort();
3729     }
3730     roundBits = zSig & 0x3FF;
3731     if ( 0x7FD <= (uint16_t) zExp ) {
3732         if (    ( 0x7FD < zExp )
3733              || (    ( zExp == 0x7FD )
3734                   && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
3735            ) {
3736             bool overflow_to_inf = roundingMode != float_round_to_odd &&
3737                                    roundIncrement != 0;
3738             float_raise(float_flag_overflow | float_flag_inexact, status);
3739             return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
3740         }
3741         if ( zExp < 0 ) {
3742             if (status->flush_to_zero) {
3743                 float_raise(float_flag_output_denormal, status);
3744                 return packFloat64(zSign, 0, 0);
3745             }
3746             isTiny = status->tininess_before_rounding
3747                   || (zExp < -1)
3748                   || (zSig + roundIncrement < UINT64_C(0x8000000000000000));
3749             shift64RightJamming( zSig, - zExp, &zSig );
3750             zExp = 0;
3751             roundBits = zSig & 0x3FF;
3752             if (isTiny && roundBits) {
3753                 float_raise(float_flag_underflow, status);
3754             }
3755             if (roundingMode == float_round_to_odd) {
3756                 /*
3757                  * For round-to-odd case, the roundIncrement depends on
3758                  * zSig which just changed.
3759                  */
3760                 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
3761             }
3762         }
3763     }
3764     if (roundBits) {
3765         status->float_exception_flags |= float_flag_inexact;
3766     }
3767     zSig = ( zSig + roundIncrement )>>10;
3768     if (!(roundBits ^ 0x200) && roundNearestEven) {
3769         zSig &= ~1;
3770     }
3771     if ( zSig == 0 ) zExp = 0;
3772     return packFloat64( zSign, zExp, zSig );
3773 
3774 }
3775 
3776 /*----------------------------------------------------------------------------
3777 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3778 | and significand `zSig', and returns the proper double-precision floating-
3779 | point value corresponding to the abstract input.  This routine is just like
3780 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
3781 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
3782 | floating-point exponent.
3783 *----------------------------------------------------------------------------*/
3784 
3785 static float64
3786  normalizeRoundAndPackFloat64(bool zSign, int zExp, uint64_t zSig,
3787                               float_status *status)
3788 {
3789     int8_t shiftCount;
3790 
3791     shiftCount = clz64(zSig) - 1;
3792     return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
3793                                status);
3794 
3795 }
3796 
3797 /*----------------------------------------------------------------------------
3798 | Normalizes the subnormal extended double-precision floating-point value
3799 | represented by the denormalized significand `aSig'.  The normalized exponent
3800 | and significand are stored at the locations pointed to by `zExpPtr' and
3801 | `zSigPtr', respectively.
3802 *----------------------------------------------------------------------------*/
3803 
3804 void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
3805                                 uint64_t *zSigPtr)
3806 {
3807     int8_t shiftCount;
3808 
3809     shiftCount = clz64(aSig);
3810     *zSigPtr = aSig<<shiftCount;
3811     *zExpPtr = 1 - shiftCount;
3812 }
3813 
3814 /*----------------------------------------------------------------------------
3815 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3816 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
3817 | and returns the proper extended double-precision floating-point value
3818 | corresponding to the abstract input.  Ordinarily, the abstract value is
3819 | rounded and packed into the extended double-precision format, with the
3820 | inexact exception raised if the abstract input cannot be represented
3821 | exactly.  However, if the abstract value is too large, the overflow and
3822 | inexact exceptions are raised and an infinity or maximal finite value is
3823 | returned.  If the abstract value is too small, the input value is rounded to
3824 | a subnormal number, and the underflow and inexact exceptions are raised if
3825 | the abstract input cannot be represented exactly as a subnormal extended
3826 | double-precision floating-point number.
3827 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
3828 | number of bits as single or double precision, respectively.  Otherwise, the
3829 | result is rounded to the full precision of the extended double-precision
3830 | format.
3831 |     The input significand must be normalized or smaller.  If the input
3832 | significand is not normalized, `zExp' must be 0; in that case, the result
3833 | returned is a subnormal number, and it must not require rounding.  The
3834 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
3835 | Floating-Point Arithmetic.
3836 *----------------------------------------------------------------------------*/
3837 
3838 floatx80 roundAndPackFloatx80(int8_t roundingPrecision, bool zSign,
3839                               int32_t zExp, uint64_t zSig0, uint64_t zSig1,
3840                               float_status *status)
3841 {
3842     int8_t roundingMode;
3843     bool roundNearestEven, increment, isTiny;
3844     int64_t roundIncrement, roundMask, roundBits;
3845 
3846     roundingMode = status->float_rounding_mode;
3847     roundNearestEven = ( roundingMode == float_round_nearest_even );
3848     if ( roundingPrecision == 80 ) goto precision80;
3849     if ( roundingPrecision == 64 ) {
3850         roundIncrement = UINT64_C(0x0000000000000400);
3851         roundMask = UINT64_C(0x00000000000007FF);
3852     }
3853     else if ( roundingPrecision == 32 ) {
3854         roundIncrement = UINT64_C(0x0000008000000000);
3855         roundMask = UINT64_C(0x000000FFFFFFFFFF);
3856     }
3857     else {
3858         goto precision80;
3859     }
3860     zSig0 |= ( zSig1 != 0 );
3861     switch (roundingMode) {
3862     case float_round_nearest_even:
3863     case float_round_ties_away:
3864         break;
3865     case float_round_to_zero:
3866         roundIncrement = 0;
3867         break;
3868     case float_round_up:
3869         roundIncrement = zSign ? 0 : roundMask;
3870         break;
3871     case float_round_down:
3872         roundIncrement = zSign ? roundMask : 0;
3873         break;
3874     default:
3875         abort();
3876     }
3877     roundBits = zSig0 & roundMask;
3878     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
3879         if (    ( 0x7FFE < zExp )
3880              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
3881            ) {
3882             goto overflow;
3883         }
3884         if ( zExp <= 0 ) {
3885             if (status->flush_to_zero) {
3886                 float_raise(float_flag_output_denormal, status);
3887                 return packFloatx80(zSign, 0, 0);
3888             }
3889             isTiny = status->tininess_before_rounding
3890                   || (zExp < 0 )
3891                   || (zSig0 <= zSig0 + roundIncrement);
3892             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
3893             zExp = 0;
3894             roundBits = zSig0 & roundMask;
3895             if (isTiny && roundBits) {
3896                 float_raise(float_flag_underflow, status);
3897             }
3898             if (roundBits) {
3899                 status->float_exception_flags |= float_flag_inexact;
3900             }
3901             zSig0 += roundIncrement;
3902             if ( (int64_t) zSig0 < 0 ) zExp = 1;
3903             roundIncrement = roundMask + 1;
3904             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
3905                 roundMask |= roundIncrement;
3906             }
3907             zSig0 &= ~ roundMask;
3908             return packFloatx80( zSign, zExp, zSig0 );
3909         }
3910     }
3911     if (roundBits) {
3912         status->float_exception_flags |= float_flag_inexact;
3913     }
3914     zSig0 += roundIncrement;
3915     if ( zSig0 < roundIncrement ) {
3916         ++zExp;
3917         zSig0 = UINT64_C(0x8000000000000000);
3918     }
3919     roundIncrement = roundMask + 1;
3920     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
3921         roundMask |= roundIncrement;
3922     }
3923     zSig0 &= ~ roundMask;
3924     if ( zSig0 == 0 ) zExp = 0;
3925     return packFloatx80( zSign, zExp, zSig0 );
3926  precision80:
3927     switch (roundingMode) {
3928     case float_round_nearest_even:
3929     case float_round_ties_away:
3930         increment = ((int64_t)zSig1 < 0);
3931         break;
3932     case float_round_to_zero:
3933         increment = 0;
3934         break;
3935     case float_round_up:
3936         increment = !zSign && zSig1;
3937         break;
3938     case float_round_down:
3939         increment = zSign && zSig1;
3940         break;
3941     default:
3942         abort();
3943     }
3944     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
3945         if (    ( 0x7FFE < zExp )
3946              || (    ( zExp == 0x7FFE )
3947                   && ( zSig0 == UINT64_C(0xFFFFFFFFFFFFFFFF) )
3948                   && increment
3949                 )
3950            ) {
3951             roundMask = 0;
3952  overflow:
3953             float_raise(float_flag_overflow | float_flag_inexact, status);
3954             if (    ( roundingMode == float_round_to_zero )
3955                  || ( zSign && ( roundingMode == float_round_up ) )
3956                  || ( ! zSign && ( roundingMode == float_round_down ) )
3957                ) {
3958                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
3959             }
3960             return packFloatx80(zSign,
3961                                 floatx80_infinity_high,
3962                                 floatx80_infinity_low);
3963         }
3964         if ( zExp <= 0 ) {
3965             isTiny = status->tininess_before_rounding
3966                   || (zExp < 0)
3967                   || !increment
3968                   || (zSig0 < UINT64_C(0xFFFFFFFFFFFFFFFF));
3969             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
3970             zExp = 0;
3971             if (isTiny && zSig1) {
3972                 float_raise(float_flag_underflow, status);
3973             }
3974             if (zSig1) {
3975                 status->float_exception_flags |= float_flag_inexact;
3976             }
3977             switch (roundingMode) {
3978             case float_round_nearest_even:
3979             case float_round_ties_away:
3980                 increment = ((int64_t)zSig1 < 0);
3981                 break;
3982             case float_round_to_zero:
3983                 increment = 0;
3984                 break;
3985             case float_round_up:
3986                 increment = !zSign && zSig1;
3987                 break;
3988             case float_round_down:
3989                 increment = zSign && zSig1;
3990                 break;
3991             default:
3992                 abort();
3993             }
3994             if ( increment ) {
3995                 ++zSig0;
3996                 if (!(zSig1 << 1) && roundNearestEven) {
3997                     zSig0 &= ~1;
3998                 }
3999                 if ( (int64_t) zSig0 < 0 ) zExp = 1;
4000             }
4001             return packFloatx80( zSign, zExp, zSig0 );
4002         }
4003     }
4004     if (zSig1) {
4005         status->float_exception_flags |= float_flag_inexact;
4006     }
4007     if ( increment ) {
4008         ++zSig0;
4009         if ( zSig0 == 0 ) {
4010             ++zExp;
4011             zSig0 = UINT64_C(0x8000000000000000);
4012         }
4013         else {
4014             if (!(zSig1 << 1) && roundNearestEven) {
4015                 zSig0 &= ~1;
4016             }
4017         }
4018     }
4019     else {
4020         if ( zSig0 == 0 ) zExp = 0;
4021     }
4022     return packFloatx80( zSign, zExp, zSig0 );
4023 
4024 }
4025 
4026 /*----------------------------------------------------------------------------
4027 | Takes an abstract floating-point value having sign `zSign', exponent
4028 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
4029 | and returns the proper extended double-precision floating-point value
4030 | corresponding to the abstract input.  This routine is just like
4031 | `roundAndPackFloatx80' except that the input significand does not have to be
4032 | normalized.
4033 *----------------------------------------------------------------------------*/
4034 
4035 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
4036                                        bool zSign, int32_t zExp,
4037                                        uint64_t zSig0, uint64_t zSig1,
4038                                        float_status *status)
4039 {
4040     int8_t shiftCount;
4041 
4042     if ( zSig0 == 0 ) {
4043         zSig0 = zSig1;
4044         zSig1 = 0;
4045         zExp -= 64;
4046     }
4047     shiftCount = clz64(zSig0);
4048     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4049     zExp -= shiftCount;
4050     return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
4051                                 zSig0, zSig1, status);
4052 
4053 }
4054 
4055 /*----------------------------------------------------------------------------
4056 | Returns the least-significant 64 fraction bits of the quadruple-precision
4057 | floating-point value `a'.
4058 *----------------------------------------------------------------------------*/
4059 
4060 static inline uint64_t extractFloat128Frac1( float128 a )
4061 {
4062 
4063     return a.low;
4064 
4065 }
4066 
4067 /*----------------------------------------------------------------------------
4068 | Returns the most-significant 48 fraction bits of the quadruple-precision
4069 | floating-point value `a'.
4070 *----------------------------------------------------------------------------*/
4071 
4072 static inline uint64_t extractFloat128Frac0( float128 a )
4073 {
4074 
4075     return a.high & UINT64_C(0x0000FFFFFFFFFFFF);
4076 
4077 }
4078 
4079 /*----------------------------------------------------------------------------
4080 | Returns the exponent bits of the quadruple-precision floating-point value
4081 | `a'.
4082 *----------------------------------------------------------------------------*/
4083 
4084 static inline int32_t extractFloat128Exp( float128 a )
4085 {
4086 
4087     return ( a.high>>48 ) & 0x7FFF;
4088 
4089 }
4090 
4091 /*----------------------------------------------------------------------------
4092 | Returns the sign bit of the quadruple-precision floating-point value `a'.
4093 *----------------------------------------------------------------------------*/
4094 
4095 static inline bool extractFloat128Sign(float128 a)
4096 {
4097     return a.high >> 63;
4098 }
4099 
4100 /*----------------------------------------------------------------------------
4101 | Normalizes the subnormal quadruple-precision floating-point value
4102 | represented by the denormalized significand formed by the concatenation of
4103 | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
4104 | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
4105 | significand are stored at the location pointed to by `zSig0Ptr', and the
4106 | least significant 64 bits of the normalized significand are stored at the
4107 | location pointed to by `zSig1Ptr'.
4108 *----------------------------------------------------------------------------*/
4109 
4110 static void
4111  normalizeFloat128Subnormal(
4112      uint64_t aSig0,
4113      uint64_t aSig1,
4114      int32_t *zExpPtr,
4115      uint64_t *zSig0Ptr,
4116      uint64_t *zSig1Ptr
4117  )
4118 {
4119     int8_t shiftCount;
4120 
4121     if ( aSig0 == 0 ) {
4122         shiftCount = clz64(aSig1) - 15;
4123         if ( shiftCount < 0 ) {
4124             *zSig0Ptr = aSig1>>( - shiftCount );
4125             *zSig1Ptr = aSig1<<( shiftCount & 63 );
4126         }
4127         else {
4128             *zSig0Ptr = aSig1<<shiftCount;
4129             *zSig1Ptr = 0;
4130         }
4131         *zExpPtr = - shiftCount - 63;
4132     }
4133     else {
4134         shiftCount = clz64(aSig0) - 15;
4135         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
4136         *zExpPtr = 1 - shiftCount;
4137     }
4138 
4139 }
4140 
4141 /*----------------------------------------------------------------------------
4142 | Packs the sign `zSign', the exponent `zExp', and the significand formed
4143 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
4144 | floating-point value, returning the result.  After being shifted into the
4145 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
4146 | added together to form the most significant 32 bits of the result.  This
4147 | means that any integer portion of `zSig0' will be added into the exponent.
4148 | Since a properly normalized significand will have an integer portion equal
4149 | to 1, the `zExp' input should be 1 less than the desired result exponent
4150 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
4151 | significand.
4152 *----------------------------------------------------------------------------*/
4153 
4154 static inline float128
4155 packFloat128(bool zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1)
4156 {
4157     float128 z;
4158 
4159     z.low = zSig1;
4160     z.high = ((uint64_t)zSign << 63) + ((uint64_t)zExp << 48) + zSig0;
4161     return z;
4162 }
4163 
4164 /*----------------------------------------------------------------------------
4165 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4166 | and extended significand formed by the concatenation of `zSig0', `zSig1',
4167 | and `zSig2', and returns the proper quadruple-precision floating-point value
4168 | corresponding to the abstract input.  Ordinarily, the abstract value is
4169 | simply rounded and packed into the quadruple-precision format, with the
4170 | inexact exception raised if the abstract input cannot be represented
4171 | exactly.  However, if the abstract value is too large, the overflow and
4172 | inexact exceptions are raised and an infinity or maximal finite value is
4173 | returned.  If the abstract value is too small, the input value is rounded to
4174 | a subnormal number, and the underflow and inexact exceptions are raised if
4175 | the abstract input cannot be represented exactly as a subnormal quadruple-
4176 | precision floating-point number.
4177 |     The input significand must be normalized or smaller.  If the input
4178 | significand is not normalized, `zExp' must be 0; in that case, the result
4179 | returned is a subnormal number, and it must not require rounding.  In the
4180 | usual case that the input significand is normalized, `zExp' must be 1 less
4181 | than the ``true'' floating-point exponent.  The handling of underflow and
4182 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4183 *----------------------------------------------------------------------------*/
4184 
4185 static float128 roundAndPackFloat128(bool zSign, int32_t zExp,
4186                                      uint64_t zSig0, uint64_t zSig1,
4187                                      uint64_t zSig2, float_status *status)
4188 {
4189     int8_t roundingMode;
4190     bool roundNearestEven, increment, isTiny;
4191 
4192     roundingMode = status->float_rounding_mode;
4193     roundNearestEven = ( roundingMode == float_round_nearest_even );
4194     switch (roundingMode) {
4195     case float_round_nearest_even:
4196     case float_round_ties_away:
4197         increment = ((int64_t)zSig2 < 0);
4198         break;
4199     case float_round_to_zero:
4200         increment = 0;
4201         break;
4202     case float_round_up:
4203         increment = !zSign && zSig2;
4204         break;
4205     case float_round_down:
4206         increment = zSign && zSig2;
4207         break;
4208     case float_round_to_odd:
4209         increment = !(zSig1 & 0x1) && zSig2;
4210         break;
4211     default:
4212         abort();
4213     }
4214     if ( 0x7FFD <= (uint32_t) zExp ) {
4215         if (    ( 0x7FFD < zExp )
4216              || (    ( zExp == 0x7FFD )
4217                   && eq128(
4218                          UINT64_C(0x0001FFFFFFFFFFFF),
4219                          UINT64_C(0xFFFFFFFFFFFFFFFF),
4220                          zSig0,
4221                          zSig1
4222                      )
4223                   && increment
4224                 )
4225            ) {
4226             float_raise(float_flag_overflow | float_flag_inexact, status);
4227             if (    ( roundingMode == float_round_to_zero )
4228                  || ( zSign && ( roundingMode == float_round_up ) )
4229                  || ( ! zSign && ( roundingMode == float_round_down ) )
4230                  || (roundingMode == float_round_to_odd)
4231                ) {
4232                 return
4233                     packFloat128(
4234                         zSign,
4235                         0x7FFE,
4236                         UINT64_C(0x0000FFFFFFFFFFFF),
4237                         UINT64_C(0xFFFFFFFFFFFFFFFF)
4238                     );
4239             }
4240             return packFloat128( zSign, 0x7FFF, 0, 0 );
4241         }
4242         if ( zExp < 0 ) {
4243             if (status->flush_to_zero) {
4244                 float_raise(float_flag_output_denormal, status);
4245                 return packFloat128(zSign, 0, 0, 0);
4246             }
4247             isTiny = status->tininess_before_rounding
4248                   || (zExp < -1)
4249                   || !increment
4250                   || lt128(zSig0, zSig1,
4251                            UINT64_C(0x0001FFFFFFFFFFFF),
4252                            UINT64_C(0xFFFFFFFFFFFFFFFF));
4253             shift128ExtraRightJamming(
4254                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
4255             zExp = 0;
4256             if (isTiny && zSig2) {
4257                 float_raise(float_flag_underflow, status);
4258             }
4259             switch (roundingMode) {
4260             case float_round_nearest_even:
4261             case float_round_ties_away:
4262                 increment = ((int64_t)zSig2 < 0);
4263                 break;
4264             case float_round_to_zero:
4265                 increment = 0;
4266                 break;
4267             case float_round_up:
4268                 increment = !zSign && zSig2;
4269                 break;
4270             case float_round_down:
4271                 increment = zSign && zSig2;
4272                 break;
4273             case float_round_to_odd:
4274                 increment = !(zSig1 & 0x1) && zSig2;
4275                 break;
4276             default:
4277                 abort();
4278             }
4279         }
4280     }
4281     if (zSig2) {
4282         status->float_exception_flags |= float_flag_inexact;
4283     }
4284     if ( increment ) {
4285         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
4286         if ((zSig2 + zSig2 == 0) && roundNearestEven) {
4287             zSig1 &= ~1;
4288         }
4289     }
4290     else {
4291         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
4292     }
4293     return packFloat128( zSign, zExp, zSig0, zSig1 );
4294 
4295 }
4296 
4297 /*----------------------------------------------------------------------------
4298 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4299 | and significand formed by the concatenation of `zSig0' and `zSig1', and
4300 | returns the proper quadruple-precision floating-point value corresponding
4301 | to the abstract input.  This routine is just like `roundAndPackFloat128'
4302 | except that the input significand has fewer bits and does not have to be
4303 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
4304 | point exponent.
4305 *----------------------------------------------------------------------------*/
4306 
4307 static float128 normalizeRoundAndPackFloat128(bool zSign, int32_t zExp,
4308                                               uint64_t zSig0, uint64_t zSig1,
4309                                               float_status *status)
4310 {
4311     int8_t shiftCount;
4312     uint64_t zSig2;
4313 
4314     if ( zSig0 == 0 ) {
4315         zSig0 = zSig1;
4316         zSig1 = 0;
4317         zExp -= 64;
4318     }
4319     shiftCount = clz64(zSig0) - 15;
4320     if ( 0 <= shiftCount ) {
4321         zSig2 = 0;
4322         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4323     }
4324     else {
4325         shift128ExtraRightJamming(
4326             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
4327     }
4328     zExp -= shiftCount;
4329     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
4330 
4331 }
4332 
4333 
4334 /*----------------------------------------------------------------------------
4335 | Returns the result of converting the 32-bit two's complement integer `a'
4336 | to the extended double-precision floating-point format.  The conversion
4337 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4338 | Arithmetic.
4339 *----------------------------------------------------------------------------*/
4340 
4341 floatx80 int32_to_floatx80(int32_t a, float_status *status)
4342 {
4343     bool zSign;
4344     uint32_t absA;
4345     int8_t shiftCount;
4346     uint64_t zSig;
4347 
4348     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
4349     zSign = ( a < 0 );
4350     absA = zSign ? - a : a;
4351     shiftCount = clz32(absA) + 32;
4352     zSig = absA;
4353     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
4354 
4355 }
4356 
4357 /*----------------------------------------------------------------------------
4358 | Returns the result of converting the 32-bit two's complement integer `a' to
4359 | the quadruple-precision floating-point format.  The conversion is performed
4360 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4361 *----------------------------------------------------------------------------*/
4362 
4363 float128 int32_to_float128(int32_t a, float_status *status)
4364 {
4365     bool zSign;
4366     uint32_t absA;
4367     int8_t shiftCount;
4368     uint64_t zSig0;
4369 
4370     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
4371     zSign = ( a < 0 );
4372     absA = zSign ? - a : a;
4373     shiftCount = clz32(absA) + 17;
4374     zSig0 = absA;
4375     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
4376 
4377 }
4378 
4379 /*----------------------------------------------------------------------------
4380 | Returns the result of converting the 64-bit two's complement integer `a'
4381 | to the extended double-precision floating-point format.  The conversion
4382 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4383 | Arithmetic.
4384 *----------------------------------------------------------------------------*/
4385 
4386 floatx80 int64_to_floatx80(int64_t a, float_status *status)
4387 {
4388     bool zSign;
4389     uint64_t absA;
4390     int8_t shiftCount;
4391 
4392     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
4393     zSign = ( a < 0 );
4394     absA = zSign ? - a : a;
4395     shiftCount = clz64(absA);
4396     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
4397 
4398 }
4399 
4400 /*----------------------------------------------------------------------------
4401 | Returns the result of converting the 64-bit two's complement integer `a' to
4402 | the quadruple-precision floating-point format.  The conversion is performed
4403 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4404 *----------------------------------------------------------------------------*/
4405 
4406 float128 int64_to_float128(int64_t a, float_status *status)
4407 {
4408     bool zSign;
4409     uint64_t absA;
4410     int8_t shiftCount;
4411     int32_t zExp;
4412     uint64_t zSig0, zSig1;
4413 
4414     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
4415     zSign = ( a < 0 );
4416     absA = zSign ? - a : a;
4417     shiftCount = clz64(absA) + 49;
4418     zExp = 0x406E - shiftCount;
4419     if ( 64 <= shiftCount ) {
4420         zSig1 = 0;
4421         zSig0 = absA;
4422         shiftCount -= 64;
4423     }
4424     else {
4425         zSig1 = absA;
4426         zSig0 = 0;
4427     }
4428     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4429     return packFloat128( zSign, zExp, zSig0, zSig1 );
4430 
4431 }
4432 
4433 /*----------------------------------------------------------------------------
4434 | Returns the result of converting the 64-bit unsigned integer `a'
4435 | to the quadruple-precision floating-point format.  The conversion is performed
4436 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4437 *----------------------------------------------------------------------------*/
4438 
4439 float128 uint64_to_float128(uint64_t a, float_status *status)
4440 {
4441     if (a == 0) {
4442         return float128_zero;
4443     }
4444     return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status);
4445 }
4446 
4447 /*----------------------------------------------------------------------------
4448 | Returns the result of converting the single-precision floating-point value
4449 | `a' to the extended double-precision floating-point format.  The conversion
4450 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4451 | Arithmetic.
4452 *----------------------------------------------------------------------------*/
4453 
4454 floatx80 float32_to_floatx80(float32 a, float_status *status)
4455 {
4456     bool aSign;
4457     int aExp;
4458     uint32_t aSig;
4459 
4460     a = float32_squash_input_denormal(a, status);
4461     aSig = extractFloat32Frac( a );
4462     aExp = extractFloat32Exp( a );
4463     aSign = extractFloat32Sign( a );
4464     if ( aExp == 0xFF ) {
4465         if (aSig) {
4466             floatx80 res = commonNaNToFloatx80(float32ToCommonNaN(a, status),
4467                                                status);
4468             return floatx80_silence_nan(res, status);
4469         }
4470         return packFloatx80(aSign,
4471                             floatx80_infinity_high,
4472                             floatx80_infinity_low);
4473     }
4474     if ( aExp == 0 ) {
4475         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
4476         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4477     }
4478     aSig |= 0x00800000;
4479     return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
4480 
4481 }
4482 
4483 /*----------------------------------------------------------------------------
4484 | Returns the result of converting the single-precision floating-point value
4485 | `a' to the double-precision floating-point format.  The conversion is
4486 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4487 | Arithmetic.
4488 *----------------------------------------------------------------------------*/
4489 
4490 float128 float32_to_float128(float32 a, float_status *status)
4491 {
4492     bool aSign;
4493     int aExp;
4494     uint32_t aSig;
4495 
4496     a = float32_squash_input_denormal(a, status);
4497     aSig = extractFloat32Frac( a );
4498     aExp = extractFloat32Exp( a );
4499     aSign = extractFloat32Sign( a );
4500     if ( aExp == 0xFF ) {
4501         if (aSig) {
4502             return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
4503         }
4504         return packFloat128( aSign, 0x7FFF, 0, 0 );
4505     }
4506     if ( aExp == 0 ) {
4507         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
4508         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4509         --aExp;
4510     }
4511     return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
4512 
4513 }
4514 
4515 /*----------------------------------------------------------------------------
4516 | Returns the remainder of the single-precision floating-point value `a'
4517 | with respect to the corresponding value `b'.  The operation is performed
4518 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4519 *----------------------------------------------------------------------------*/
4520 
4521 float32 float32_rem(float32 a, float32 b, float_status *status)
4522 {
4523     bool aSign, zSign;
4524     int aExp, bExp, expDiff;
4525     uint32_t aSig, bSig;
4526     uint32_t q;
4527     uint64_t aSig64, bSig64, q64;
4528     uint32_t alternateASig;
4529     int32_t sigMean;
4530     a = float32_squash_input_denormal(a, status);
4531     b = float32_squash_input_denormal(b, status);
4532 
4533     aSig = extractFloat32Frac( a );
4534     aExp = extractFloat32Exp( a );
4535     aSign = extractFloat32Sign( a );
4536     bSig = extractFloat32Frac( b );
4537     bExp = extractFloat32Exp( b );
4538     if ( aExp == 0xFF ) {
4539         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
4540             return propagateFloat32NaN(a, b, status);
4541         }
4542         float_raise(float_flag_invalid, status);
4543         return float32_default_nan(status);
4544     }
4545     if ( bExp == 0xFF ) {
4546         if (bSig) {
4547             return propagateFloat32NaN(a, b, status);
4548         }
4549         return a;
4550     }
4551     if ( bExp == 0 ) {
4552         if ( bSig == 0 ) {
4553             float_raise(float_flag_invalid, status);
4554             return float32_default_nan(status);
4555         }
4556         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
4557     }
4558     if ( aExp == 0 ) {
4559         if ( aSig == 0 ) return a;
4560         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4561     }
4562     expDiff = aExp - bExp;
4563     aSig |= 0x00800000;
4564     bSig |= 0x00800000;
4565     if ( expDiff < 32 ) {
4566         aSig <<= 8;
4567         bSig <<= 8;
4568         if ( expDiff < 0 ) {
4569             if ( expDiff < -1 ) return a;
4570             aSig >>= 1;
4571         }
4572         q = ( bSig <= aSig );
4573         if ( q ) aSig -= bSig;
4574         if ( 0 < expDiff ) {
4575             q = ( ( (uint64_t) aSig )<<32 ) / bSig;
4576             q >>= 32 - expDiff;
4577             bSig >>= 2;
4578             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4579         }
4580         else {
4581             aSig >>= 2;
4582             bSig >>= 2;
4583         }
4584     }
4585     else {
4586         if ( bSig <= aSig ) aSig -= bSig;
4587         aSig64 = ( (uint64_t) aSig )<<40;
4588         bSig64 = ( (uint64_t) bSig )<<40;
4589         expDiff -= 64;
4590         while ( 0 < expDiff ) {
4591             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
4592             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
4593             aSig64 = - ( ( bSig * q64 )<<38 );
4594             expDiff -= 62;
4595         }
4596         expDiff += 64;
4597         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
4598         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
4599         q = q64>>( 64 - expDiff );
4600         bSig <<= 6;
4601         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
4602     }
4603     do {
4604         alternateASig = aSig;
4605         ++q;
4606         aSig -= bSig;
4607     } while ( 0 <= (int32_t) aSig );
4608     sigMean = aSig + alternateASig;
4609     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4610         aSig = alternateASig;
4611     }
4612     zSign = ( (int32_t) aSig < 0 );
4613     if ( zSign ) aSig = - aSig;
4614     return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
4615 }
4616 
4617 
4618 
4619 /*----------------------------------------------------------------------------
4620 | Returns the binary exponential of the single-precision floating-point value
4621 | `a'. The operation is performed according to the IEC/IEEE Standard for
4622 | Binary Floating-Point Arithmetic.
4623 |
4624 | Uses the following identities:
4625 |
4626 | 1. -------------------------------------------------------------------------
4627 |      x    x*ln(2)
4628 |     2  = e
4629 |
4630 | 2. -------------------------------------------------------------------------
4631 |                      2     3     4     5           n
4632 |      x        x     x     x     x     x           x
4633 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
4634 |               1!    2!    3!    4!    5!          n!
4635 *----------------------------------------------------------------------------*/
4636 
4637 static const float64 float32_exp2_coefficients[15] =
4638 {
4639     const_float64( 0x3ff0000000000000ll ), /*  1 */
4640     const_float64( 0x3fe0000000000000ll ), /*  2 */
4641     const_float64( 0x3fc5555555555555ll ), /*  3 */
4642     const_float64( 0x3fa5555555555555ll ), /*  4 */
4643     const_float64( 0x3f81111111111111ll ), /*  5 */
4644     const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
4645     const_float64( 0x3f2a01a01a01a01all ), /*  7 */
4646     const_float64( 0x3efa01a01a01a01all ), /*  8 */
4647     const_float64( 0x3ec71de3a556c734ll ), /*  9 */
4648     const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
4649     const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
4650     const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
4651     const_float64( 0x3de6124613a86d09ll ), /* 13 */
4652     const_float64( 0x3da93974a8c07c9dll ), /* 14 */
4653     const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
4654 };
4655 
4656 float32 float32_exp2(float32 a, float_status *status)
4657 {
4658     bool aSign;
4659     int aExp;
4660     uint32_t aSig;
4661     float64 r, x, xn;
4662     int i;
4663     a = float32_squash_input_denormal(a, status);
4664 
4665     aSig = extractFloat32Frac( a );
4666     aExp = extractFloat32Exp( a );
4667     aSign = extractFloat32Sign( a );
4668 
4669     if ( aExp == 0xFF) {
4670         if (aSig) {
4671             return propagateFloat32NaN(a, float32_zero, status);
4672         }
4673         return (aSign) ? float32_zero : a;
4674     }
4675     if (aExp == 0) {
4676         if (aSig == 0) return float32_one;
4677     }
4678 
4679     float_raise(float_flag_inexact, status);
4680 
4681     /* ******************************* */
4682     /* using float64 for approximation */
4683     /* ******************************* */
4684     x = float32_to_float64(a, status);
4685     x = float64_mul(x, float64_ln2, status);
4686 
4687     xn = x;
4688     r = float64_one;
4689     for (i = 0 ; i < 15 ; i++) {
4690         float64 f;
4691 
4692         f = float64_mul(xn, float32_exp2_coefficients[i], status);
4693         r = float64_add(r, f, status);
4694 
4695         xn = float64_mul(xn, x, status);
4696     }
4697 
4698     return float64_to_float32(r, status);
4699 }
4700 
4701 /*----------------------------------------------------------------------------
4702 | Returns the binary log of the single-precision floating-point value `a'.
4703 | The operation is performed according to the IEC/IEEE Standard for Binary
4704 | Floating-Point Arithmetic.
4705 *----------------------------------------------------------------------------*/
4706 float32 float32_log2(float32 a, float_status *status)
4707 {
4708     bool aSign, zSign;
4709     int aExp;
4710     uint32_t aSig, zSig, i;
4711 
4712     a = float32_squash_input_denormal(a, status);
4713     aSig = extractFloat32Frac( a );
4714     aExp = extractFloat32Exp( a );
4715     aSign = extractFloat32Sign( a );
4716 
4717     if ( aExp == 0 ) {
4718         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
4719         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4720     }
4721     if ( aSign ) {
4722         float_raise(float_flag_invalid, status);
4723         return float32_default_nan(status);
4724     }
4725     if ( aExp == 0xFF ) {
4726         if (aSig) {
4727             return propagateFloat32NaN(a, float32_zero, status);
4728         }
4729         return a;
4730     }
4731 
4732     aExp -= 0x7F;
4733     aSig |= 0x00800000;
4734     zSign = aExp < 0;
4735     zSig = aExp << 23;
4736 
4737     for (i = 1 << 22; i > 0; i >>= 1) {
4738         aSig = ( (uint64_t)aSig * aSig ) >> 23;
4739         if ( aSig & 0x01000000 ) {
4740             aSig >>= 1;
4741             zSig |= i;
4742         }
4743     }
4744 
4745     if ( zSign )
4746         zSig = -zSig;
4747 
4748     return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
4749 }
4750 
4751 /*----------------------------------------------------------------------------
4752 | Returns the result of converting the double-precision floating-point value
4753 | `a' to the extended double-precision floating-point format.  The conversion
4754 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4755 | Arithmetic.
4756 *----------------------------------------------------------------------------*/
4757 
4758 floatx80 float64_to_floatx80(float64 a, float_status *status)
4759 {
4760     bool aSign;
4761     int aExp;
4762     uint64_t aSig;
4763 
4764     a = float64_squash_input_denormal(a, status);
4765     aSig = extractFloat64Frac( a );
4766     aExp = extractFloat64Exp( a );
4767     aSign = extractFloat64Sign( a );
4768     if ( aExp == 0x7FF ) {
4769         if (aSig) {
4770             floatx80 res = commonNaNToFloatx80(float64ToCommonNaN(a, status),
4771                                                status);
4772             return floatx80_silence_nan(res, status);
4773         }
4774         return packFloatx80(aSign,
4775                             floatx80_infinity_high,
4776                             floatx80_infinity_low);
4777     }
4778     if ( aExp == 0 ) {
4779         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
4780         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4781     }
4782     return
4783         packFloatx80(
4784             aSign, aExp + 0x3C00, (aSig | UINT64_C(0x0010000000000000)) << 11);
4785 
4786 }
4787 
4788 /*----------------------------------------------------------------------------
4789 | Returns the result of converting the double-precision floating-point value
4790 | `a' to the quadruple-precision floating-point format.  The conversion is
4791 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4792 | Arithmetic.
4793 *----------------------------------------------------------------------------*/
4794 
4795 float128 float64_to_float128(float64 a, float_status *status)
4796 {
4797     bool aSign;
4798     int aExp;
4799     uint64_t aSig, zSig0, zSig1;
4800 
4801     a = float64_squash_input_denormal(a, status);
4802     aSig = extractFloat64Frac( a );
4803     aExp = extractFloat64Exp( a );
4804     aSign = extractFloat64Sign( a );
4805     if ( aExp == 0x7FF ) {
4806         if (aSig) {
4807             return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
4808         }
4809         return packFloat128( aSign, 0x7FFF, 0, 0 );
4810     }
4811     if ( aExp == 0 ) {
4812         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
4813         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4814         --aExp;
4815     }
4816     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
4817     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
4818 
4819 }
4820 
4821 
4822 /*----------------------------------------------------------------------------
4823 | Returns the remainder of the double-precision floating-point value `a'
4824 | with respect to the corresponding value `b'.  The operation is performed
4825 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4826 *----------------------------------------------------------------------------*/
4827 
4828 float64 float64_rem(float64 a, float64 b, float_status *status)
4829 {
4830     bool aSign, zSign;
4831     int aExp, bExp, expDiff;
4832     uint64_t aSig, bSig;
4833     uint64_t q, alternateASig;
4834     int64_t sigMean;
4835 
4836     a = float64_squash_input_denormal(a, status);
4837     b = float64_squash_input_denormal(b, status);
4838     aSig = extractFloat64Frac( a );
4839     aExp = extractFloat64Exp( a );
4840     aSign = extractFloat64Sign( a );
4841     bSig = extractFloat64Frac( b );
4842     bExp = extractFloat64Exp( b );
4843     if ( aExp == 0x7FF ) {
4844         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
4845             return propagateFloat64NaN(a, b, status);
4846         }
4847         float_raise(float_flag_invalid, status);
4848         return float64_default_nan(status);
4849     }
4850     if ( bExp == 0x7FF ) {
4851         if (bSig) {
4852             return propagateFloat64NaN(a, b, status);
4853         }
4854         return a;
4855     }
4856     if ( bExp == 0 ) {
4857         if ( bSig == 0 ) {
4858             float_raise(float_flag_invalid, status);
4859             return float64_default_nan(status);
4860         }
4861         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4862     }
4863     if ( aExp == 0 ) {
4864         if ( aSig == 0 ) return a;
4865         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4866     }
4867     expDiff = aExp - bExp;
4868     aSig = (aSig | UINT64_C(0x0010000000000000)) << 11;
4869     bSig = (bSig | UINT64_C(0x0010000000000000)) << 11;
4870     if ( expDiff < 0 ) {
4871         if ( expDiff < -1 ) return a;
4872         aSig >>= 1;
4873     }
4874     q = ( bSig <= aSig );
4875     if ( q ) aSig -= bSig;
4876     expDiff -= 64;
4877     while ( 0 < expDiff ) {
4878         q = estimateDiv128To64( aSig, 0, bSig );
4879         q = ( 2 < q ) ? q - 2 : 0;
4880         aSig = - ( ( bSig>>2 ) * q );
4881         expDiff -= 62;
4882     }
4883     expDiff += 64;
4884     if ( 0 < expDiff ) {
4885         q = estimateDiv128To64( aSig, 0, bSig );
4886         q = ( 2 < q ) ? q - 2 : 0;
4887         q >>= 64 - expDiff;
4888         bSig >>= 2;
4889         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4890     }
4891     else {
4892         aSig >>= 2;
4893         bSig >>= 2;
4894     }
4895     do {
4896         alternateASig = aSig;
4897         ++q;
4898         aSig -= bSig;
4899     } while ( 0 <= (int64_t) aSig );
4900     sigMean = aSig + alternateASig;
4901     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4902         aSig = alternateASig;
4903     }
4904     zSign = ( (int64_t) aSig < 0 );
4905     if ( zSign ) aSig = - aSig;
4906     return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
4907 
4908 }
4909 
4910 /*----------------------------------------------------------------------------
4911 | Returns the binary log of the double-precision floating-point value `a'.
4912 | The operation is performed according to the IEC/IEEE Standard for Binary
4913 | Floating-Point Arithmetic.
4914 *----------------------------------------------------------------------------*/
4915 float64 float64_log2(float64 a, float_status *status)
4916 {
4917     bool aSign, zSign;
4918     int aExp;
4919     uint64_t aSig, aSig0, aSig1, zSig, i;
4920     a = float64_squash_input_denormal(a, status);
4921 
4922     aSig = extractFloat64Frac( a );
4923     aExp = extractFloat64Exp( a );
4924     aSign = extractFloat64Sign( a );
4925 
4926     if ( aExp == 0 ) {
4927         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4928         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4929     }
4930     if ( aSign ) {
4931         float_raise(float_flag_invalid, status);
4932         return float64_default_nan(status);
4933     }
4934     if ( aExp == 0x7FF ) {
4935         if (aSig) {
4936             return propagateFloat64NaN(a, float64_zero, status);
4937         }
4938         return a;
4939     }
4940 
4941     aExp -= 0x3FF;
4942     aSig |= UINT64_C(0x0010000000000000);
4943     zSign = aExp < 0;
4944     zSig = (uint64_t)aExp << 52;
4945     for (i = 1LL << 51; i > 0; i >>= 1) {
4946         mul64To128( aSig, aSig, &aSig0, &aSig1 );
4947         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4948         if ( aSig & UINT64_C(0x0020000000000000) ) {
4949             aSig >>= 1;
4950             zSig |= i;
4951         }
4952     }
4953 
4954     if ( zSign )
4955         zSig = -zSig;
4956     return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
4957 }
4958 
4959 /*----------------------------------------------------------------------------
4960 | Returns the result of converting the extended double-precision floating-
4961 | point value `a' to the 32-bit two's complement integer format.  The
4962 | conversion is performed according to the IEC/IEEE Standard for Binary
4963 | Floating-Point Arithmetic---which means in particular that the conversion
4964 | is rounded according to the current rounding mode.  If `a' is a NaN, the
4965 | largest positive integer is returned.  Otherwise, if the conversion
4966 | overflows, the largest integer with the same sign as `a' is returned.
4967 *----------------------------------------------------------------------------*/
4968 
4969 int32_t floatx80_to_int32(floatx80 a, float_status *status)
4970 {
4971     bool aSign;
4972     int32_t aExp, shiftCount;
4973     uint64_t aSig;
4974 
4975     if (floatx80_invalid_encoding(a)) {
4976         float_raise(float_flag_invalid, status);
4977         return 1 << 31;
4978     }
4979     aSig = extractFloatx80Frac( a );
4980     aExp = extractFloatx80Exp( a );
4981     aSign = extractFloatx80Sign( a );
4982     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4983     shiftCount = 0x4037 - aExp;
4984     if ( shiftCount <= 0 ) shiftCount = 1;
4985     shift64RightJamming( aSig, shiftCount, &aSig );
4986     return roundAndPackInt32(aSign, aSig, status);
4987 
4988 }
4989 
4990 /*----------------------------------------------------------------------------
4991 | Returns the result of converting the extended double-precision floating-
4992 | point value `a' to the 32-bit two's complement integer format.  The
4993 | conversion is performed according to the IEC/IEEE Standard for Binary
4994 | Floating-Point Arithmetic, except that the conversion is always rounded
4995 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4996 | Otherwise, if the conversion overflows, the largest integer with the same
4997 | sign as `a' is returned.
4998 *----------------------------------------------------------------------------*/
4999 
5000 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
5001 {
5002     bool aSign;
5003     int32_t aExp, shiftCount;
5004     uint64_t aSig, savedASig;
5005     int32_t z;
5006 
5007     if (floatx80_invalid_encoding(a)) {
5008         float_raise(float_flag_invalid, status);
5009         return 1 << 31;
5010     }
5011     aSig = extractFloatx80Frac( a );
5012     aExp = extractFloatx80Exp( a );
5013     aSign = extractFloatx80Sign( a );
5014     if ( 0x401E < aExp ) {
5015         if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
5016         goto invalid;
5017     }
5018     else if ( aExp < 0x3FFF ) {
5019         if (aExp || aSig) {
5020             status->float_exception_flags |= float_flag_inexact;
5021         }
5022         return 0;
5023     }
5024     shiftCount = 0x403E - aExp;
5025     savedASig = aSig;
5026     aSig >>= shiftCount;
5027     z = aSig;
5028     if ( aSign ) z = - z;
5029     if ( ( z < 0 ) ^ aSign ) {
5030  invalid:
5031         float_raise(float_flag_invalid, status);
5032         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5033     }
5034     if ( ( aSig<<shiftCount ) != savedASig ) {
5035         status->float_exception_flags |= float_flag_inexact;
5036     }
5037     return z;
5038 
5039 }
5040 
5041 /*----------------------------------------------------------------------------
5042 | Returns the result of converting the extended double-precision floating-
5043 | point value `a' to the 64-bit two's complement integer format.  The
5044 | conversion is performed according to the IEC/IEEE Standard for Binary
5045 | Floating-Point Arithmetic---which means in particular that the conversion
5046 | is rounded according to the current rounding mode.  If `a' is a NaN,
5047 | the largest positive integer is returned.  Otherwise, if the conversion
5048 | overflows, the largest integer with the same sign as `a' is returned.
5049 *----------------------------------------------------------------------------*/
5050 
5051 int64_t floatx80_to_int64(floatx80 a, float_status *status)
5052 {
5053     bool aSign;
5054     int32_t aExp, shiftCount;
5055     uint64_t aSig, aSigExtra;
5056 
5057     if (floatx80_invalid_encoding(a)) {
5058         float_raise(float_flag_invalid, status);
5059         return 1ULL << 63;
5060     }
5061     aSig = extractFloatx80Frac( a );
5062     aExp = extractFloatx80Exp( a );
5063     aSign = extractFloatx80Sign( a );
5064     shiftCount = 0x403E - aExp;
5065     if ( shiftCount <= 0 ) {
5066         if ( shiftCount ) {
5067             float_raise(float_flag_invalid, status);
5068             if (!aSign || floatx80_is_any_nan(a)) {
5069                 return INT64_MAX;
5070             }
5071             return INT64_MIN;
5072         }
5073         aSigExtra = 0;
5074     }
5075     else {
5076         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
5077     }
5078     return roundAndPackInt64(aSign, aSig, aSigExtra, status);
5079 
5080 }
5081 
5082 /*----------------------------------------------------------------------------
5083 | Returns the result of converting the extended double-precision floating-
5084 | point value `a' to the 64-bit two's complement integer format.  The
5085 | conversion is performed according to the IEC/IEEE Standard for Binary
5086 | Floating-Point Arithmetic, except that the conversion is always rounded
5087 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
5088 | Otherwise, if the conversion overflows, the largest integer with the same
5089 | sign as `a' is returned.
5090 *----------------------------------------------------------------------------*/
5091 
5092 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
5093 {
5094     bool aSign;
5095     int32_t aExp, shiftCount;
5096     uint64_t aSig;
5097     int64_t z;
5098 
5099     if (floatx80_invalid_encoding(a)) {
5100         float_raise(float_flag_invalid, status);
5101         return 1ULL << 63;
5102     }
5103     aSig = extractFloatx80Frac( a );
5104     aExp = extractFloatx80Exp( a );
5105     aSign = extractFloatx80Sign( a );
5106     shiftCount = aExp - 0x403E;
5107     if ( 0 <= shiftCount ) {
5108         aSig &= UINT64_C(0x7FFFFFFFFFFFFFFF);
5109         if ( ( a.high != 0xC03E ) || aSig ) {
5110             float_raise(float_flag_invalid, status);
5111             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
5112                 return INT64_MAX;
5113             }
5114         }
5115         return INT64_MIN;
5116     }
5117     else if ( aExp < 0x3FFF ) {
5118         if (aExp | aSig) {
5119             status->float_exception_flags |= float_flag_inexact;
5120         }
5121         return 0;
5122     }
5123     z = aSig>>( - shiftCount );
5124     if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
5125         status->float_exception_flags |= float_flag_inexact;
5126     }
5127     if ( aSign ) z = - z;
5128     return z;
5129 
5130 }
5131 
5132 /*----------------------------------------------------------------------------
5133 | Returns the result of converting the extended double-precision floating-
5134 | point value `a' to the single-precision floating-point format.  The
5135 | conversion is performed according to the IEC/IEEE Standard for Binary
5136 | Floating-Point Arithmetic.
5137 *----------------------------------------------------------------------------*/
5138 
5139 float32 floatx80_to_float32(floatx80 a, float_status *status)
5140 {
5141     bool aSign;
5142     int32_t aExp;
5143     uint64_t aSig;
5144 
5145     if (floatx80_invalid_encoding(a)) {
5146         float_raise(float_flag_invalid, status);
5147         return float32_default_nan(status);
5148     }
5149     aSig = extractFloatx80Frac( a );
5150     aExp = extractFloatx80Exp( a );
5151     aSign = extractFloatx80Sign( a );
5152     if ( aExp == 0x7FFF ) {
5153         if ( (uint64_t) ( aSig<<1 ) ) {
5154             float32 res = commonNaNToFloat32(floatx80ToCommonNaN(a, status),
5155                                              status);
5156             return float32_silence_nan(res, status);
5157         }
5158         return packFloat32( aSign, 0xFF, 0 );
5159     }
5160     shift64RightJamming( aSig, 33, &aSig );
5161     if ( aExp || aSig ) aExp -= 0x3F81;
5162     return roundAndPackFloat32(aSign, aExp, aSig, status);
5163 
5164 }
5165 
5166 /*----------------------------------------------------------------------------
5167 | Returns the result of converting the extended double-precision floating-
5168 | point value `a' to the double-precision floating-point format.  The
5169 | conversion is performed according to the IEC/IEEE Standard for Binary
5170 | Floating-Point Arithmetic.
5171 *----------------------------------------------------------------------------*/
5172 
5173 float64 floatx80_to_float64(floatx80 a, float_status *status)
5174 {
5175     bool aSign;
5176     int32_t aExp;
5177     uint64_t aSig, zSig;
5178 
5179     if (floatx80_invalid_encoding(a)) {
5180         float_raise(float_flag_invalid, status);
5181         return float64_default_nan(status);
5182     }
5183     aSig = extractFloatx80Frac( a );
5184     aExp = extractFloatx80Exp( a );
5185     aSign = extractFloatx80Sign( a );
5186     if ( aExp == 0x7FFF ) {
5187         if ( (uint64_t) ( aSig<<1 ) ) {
5188             float64 res = commonNaNToFloat64(floatx80ToCommonNaN(a, status),
5189                                              status);
5190             return float64_silence_nan(res, status);
5191         }
5192         return packFloat64( aSign, 0x7FF, 0 );
5193     }
5194     shift64RightJamming( aSig, 1, &zSig );
5195     if ( aExp || aSig ) aExp -= 0x3C01;
5196     return roundAndPackFloat64(aSign, aExp, zSig, status);
5197 
5198 }
5199 
5200 /*----------------------------------------------------------------------------
5201 | Returns the result of converting the extended double-precision floating-
5202 | point value `a' to the quadruple-precision floating-point format.  The
5203 | conversion is performed according to the IEC/IEEE Standard for Binary
5204 | Floating-Point Arithmetic.
5205 *----------------------------------------------------------------------------*/
5206 
5207 float128 floatx80_to_float128(floatx80 a, float_status *status)
5208 {
5209     bool aSign;
5210     int aExp;
5211     uint64_t aSig, zSig0, zSig1;
5212 
5213     if (floatx80_invalid_encoding(a)) {
5214         float_raise(float_flag_invalid, status);
5215         return float128_default_nan(status);
5216     }
5217     aSig = extractFloatx80Frac( a );
5218     aExp = extractFloatx80Exp( a );
5219     aSign = extractFloatx80Sign( a );
5220     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
5221         float128 res = commonNaNToFloat128(floatx80ToCommonNaN(a, status),
5222                                            status);
5223         return float128_silence_nan(res, status);
5224     }
5225     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
5226     return packFloat128( aSign, aExp, zSig0, zSig1 );
5227 
5228 }
5229 
5230 /*----------------------------------------------------------------------------
5231 | Rounds the extended double-precision floating-point value `a'
5232 | to the precision provided by floatx80_rounding_precision and returns the
5233 | result as an extended double-precision floating-point value.
5234 | The operation is performed according to the IEC/IEEE Standard for Binary
5235 | Floating-Point Arithmetic.
5236 *----------------------------------------------------------------------------*/
5237 
5238 floatx80 floatx80_round(floatx80 a, float_status *status)
5239 {
5240     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5241                                 extractFloatx80Sign(a),
5242                                 extractFloatx80Exp(a),
5243                                 extractFloatx80Frac(a), 0, status);
5244 }
5245 
5246 /*----------------------------------------------------------------------------
5247 | Rounds the extended double-precision floating-point value `a' to an integer,
5248 | and returns the result as an extended quadruple-precision floating-point
5249 | value.  The operation is performed according to the IEC/IEEE Standard for
5250 | Binary Floating-Point Arithmetic.
5251 *----------------------------------------------------------------------------*/
5252 
5253 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
5254 {
5255     bool aSign;
5256     int32_t aExp;
5257     uint64_t lastBitMask, roundBitsMask;
5258     floatx80 z;
5259 
5260     if (floatx80_invalid_encoding(a)) {
5261         float_raise(float_flag_invalid, status);
5262         return floatx80_default_nan(status);
5263     }
5264     aExp = extractFloatx80Exp( a );
5265     if ( 0x403E <= aExp ) {
5266         if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
5267             return propagateFloatx80NaN(a, a, status);
5268         }
5269         return a;
5270     }
5271     if ( aExp < 0x3FFF ) {
5272         if (    ( aExp == 0 )
5273              && ( (uint64_t) ( extractFloatx80Frac( a ) ) == 0 ) ) {
5274             return a;
5275         }
5276         status->float_exception_flags |= float_flag_inexact;
5277         aSign = extractFloatx80Sign( a );
5278         switch (status->float_rounding_mode) {
5279          case float_round_nearest_even:
5280             if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
5281                ) {
5282                 return
5283                     packFloatx80( aSign, 0x3FFF, UINT64_C(0x8000000000000000));
5284             }
5285             break;
5286         case float_round_ties_away:
5287             if (aExp == 0x3FFE) {
5288                 return packFloatx80(aSign, 0x3FFF, UINT64_C(0x8000000000000000));
5289             }
5290             break;
5291          case float_round_down:
5292             return
5293                   aSign ?
5294                       packFloatx80( 1, 0x3FFF, UINT64_C(0x8000000000000000))
5295                 : packFloatx80( 0, 0, 0 );
5296          case float_round_up:
5297             return
5298                   aSign ? packFloatx80( 1, 0, 0 )
5299                 : packFloatx80( 0, 0x3FFF, UINT64_C(0x8000000000000000));
5300 
5301         case float_round_to_zero:
5302             break;
5303         default:
5304             g_assert_not_reached();
5305         }
5306         return packFloatx80( aSign, 0, 0 );
5307     }
5308     lastBitMask = 1;
5309     lastBitMask <<= 0x403E - aExp;
5310     roundBitsMask = lastBitMask - 1;
5311     z = a;
5312     switch (status->float_rounding_mode) {
5313     case float_round_nearest_even:
5314         z.low += lastBitMask>>1;
5315         if ((z.low & roundBitsMask) == 0) {
5316             z.low &= ~lastBitMask;
5317         }
5318         break;
5319     case float_round_ties_away:
5320         z.low += lastBitMask >> 1;
5321         break;
5322     case float_round_to_zero:
5323         break;
5324     case float_round_up:
5325         if (!extractFloatx80Sign(z)) {
5326             z.low += roundBitsMask;
5327         }
5328         break;
5329     case float_round_down:
5330         if (extractFloatx80Sign(z)) {
5331             z.low += roundBitsMask;
5332         }
5333         break;
5334     default:
5335         abort();
5336     }
5337     z.low &= ~ roundBitsMask;
5338     if ( z.low == 0 ) {
5339         ++z.high;
5340         z.low = UINT64_C(0x8000000000000000);
5341     }
5342     if (z.low != a.low) {
5343         status->float_exception_flags |= float_flag_inexact;
5344     }
5345     return z;
5346 
5347 }
5348 
5349 /*----------------------------------------------------------------------------
5350 | Returns the result of adding the absolute values of the extended double-
5351 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
5352 | negated before being returned.  `zSign' is ignored if the result is a NaN.
5353 | The addition is performed according to the IEC/IEEE Standard for Binary
5354 | Floating-Point Arithmetic.
5355 *----------------------------------------------------------------------------*/
5356 
5357 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, bool zSign,
5358                                 float_status *status)
5359 {
5360     int32_t aExp, bExp, zExp;
5361     uint64_t aSig, bSig, zSig0, zSig1;
5362     int32_t expDiff;
5363 
5364     aSig = extractFloatx80Frac( a );
5365     aExp = extractFloatx80Exp( a );
5366     bSig = extractFloatx80Frac( b );
5367     bExp = extractFloatx80Exp( b );
5368     expDiff = aExp - bExp;
5369     if ( 0 < expDiff ) {
5370         if ( aExp == 0x7FFF ) {
5371             if ((uint64_t)(aSig << 1)) {
5372                 return propagateFloatx80NaN(a, b, status);
5373             }
5374             return a;
5375         }
5376         if ( bExp == 0 ) --expDiff;
5377         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5378         zExp = aExp;
5379     }
5380     else if ( expDiff < 0 ) {
5381         if ( bExp == 0x7FFF ) {
5382             if ((uint64_t)(bSig << 1)) {
5383                 return propagateFloatx80NaN(a, b, status);
5384             }
5385             return packFloatx80(zSign,
5386                                 floatx80_infinity_high,
5387                                 floatx80_infinity_low);
5388         }
5389         if ( aExp == 0 ) ++expDiff;
5390         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5391         zExp = bExp;
5392     }
5393     else {
5394         if ( aExp == 0x7FFF ) {
5395             if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5396                 return propagateFloatx80NaN(a, b, status);
5397             }
5398             return a;
5399         }
5400         zSig1 = 0;
5401         zSig0 = aSig + bSig;
5402         if ( aExp == 0 ) {
5403             if ((aSig | bSig) & UINT64_C(0x8000000000000000) && zSig0 < aSig) {
5404                 /* At least one of the values is a pseudo-denormal,
5405                  * and there is a carry out of the result.  */
5406                 zExp = 1;
5407                 goto shiftRight1;
5408             }
5409             if (zSig0 == 0) {
5410                 return packFloatx80(zSign, 0, 0);
5411             }
5412             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
5413             goto roundAndPack;
5414         }
5415         zExp = aExp;
5416         goto shiftRight1;
5417     }
5418     zSig0 = aSig + bSig;
5419     if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
5420  shiftRight1:
5421     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
5422     zSig0 |= UINT64_C(0x8000000000000000);
5423     ++zExp;
5424  roundAndPack:
5425     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5426                                 zSign, zExp, zSig0, zSig1, status);
5427 }
5428 
5429 /*----------------------------------------------------------------------------
5430 | Returns the result of subtracting the absolute values of the extended
5431 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
5432 | difference is negated before being returned.  `zSign' is ignored if the
5433 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
5434 | Standard for Binary Floating-Point Arithmetic.
5435 *----------------------------------------------------------------------------*/
5436 
5437 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, bool zSign,
5438                                 float_status *status)
5439 {
5440     int32_t aExp, bExp, zExp;
5441     uint64_t aSig, bSig, zSig0, zSig1;
5442     int32_t expDiff;
5443 
5444     aSig = extractFloatx80Frac( a );
5445     aExp = extractFloatx80Exp( a );
5446     bSig = extractFloatx80Frac( b );
5447     bExp = extractFloatx80Exp( b );
5448     expDiff = aExp - bExp;
5449     if ( 0 < expDiff ) goto aExpBigger;
5450     if ( expDiff < 0 ) goto bExpBigger;
5451     if ( aExp == 0x7FFF ) {
5452         if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5453             return propagateFloatx80NaN(a, b, status);
5454         }
5455         float_raise(float_flag_invalid, status);
5456         return floatx80_default_nan(status);
5457     }
5458     if ( aExp == 0 ) {
5459         aExp = 1;
5460         bExp = 1;
5461     }
5462     zSig1 = 0;
5463     if ( bSig < aSig ) goto aBigger;
5464     if ( aSig < bSig ) goto bBigger;
5465     return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
5466  bExpBigger:
5467     if ( bExp == 0x7FFF ) {
5468         if ((uint64_t)(bSig << 1)) {
5469             return propagateFloatx80NaN(a, b, status);
5470         }
5471         return packFloatx80(zSign ^ 1, floatx80_infinity_high,
5472                             floatx80_infinity_low);
5473     }
5474     if ( aExp == 0 ) ++expDiff;
5475     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5476  bBigger:
5477     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
5478     zExp = bExp;
5479     zSign ^= 1;
5480     goto normalizeRoundAndPack;
5481  aExpBigger:
5482     if ( aExp == 0x7FFF ) {
5483         if ((uint64_t)(aSig << 1)) {
5484             return propagateFloatx80NaN(a, b, status);
5485         }
5486         return a;
5487     }
5488     if ( bExp == 0 ) --expDiff;
5489     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5490  aBigger:
5491     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
5492     zExp = aExp;
5493  normalizeRoundAndPack:
5494     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
5495                                          zSign, zExp, zSig0, zSig1, status);
5496 }
5497 
5498 /*----------------------------------------------------------------------------
5499 | Returns the result of adding the extended double-precision floating-point
5500 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
5501 | Standard for Binary Floating-Point Arithmetic.
5502 *----------------------------------------------------------------------------*/
5503 
5504 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
5505 {
5506     bool aSign, bSign;
5507 
5508     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5509         float_raise(float_flag_invalid, status);
5510         return floatx80_default_nan(status);
5511     }
5512     aSign = extractFloatx80Sign( a );
5513     bSign = extractFloatx80Sign( b );
5514     if ( aSign == bSign ) {
5515         return addFloatx80Sigs(a, b, aSign, status);
5516     }
5517     else {
5518         return subFloatx80Sigs(a, b, aSign, status);
5519     }
5520 
5521 }
5522 
5523 /*----------------------------------------------------------------------------
5524 | Returns the result of subtracting the extended double-precision floating-
5525 | point values `a' and `b'.  The operation is performed according to the
5526 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5527 *----------------------------------------------------------------------------*/
5528 
5529 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
5530 {
5531     bool aSign, bSign;
5532 
5533     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5534         float_raise(float_flag_invalid, status);
5535         return floatx80_default_nan(status);
5536     }
5537     aSign = extractFloatx80Sign( a );
5538     bSign = extractFloatx80Sign( b );
5539     if ( aSign == bSign ) {
5540         return subFloatx80Sigs(a, b, aSign, status);
5541     }
5542     else {
5543         return addFloatx80Sigs(a, b, aSign, status);
5544     }
5545 
5546 }
5547 
5548 /*----------------------------------------------------------------------------
5549 | Returns the result of multiplying the extended double-precision floating-
5550 | point values `a' and `b'.  The operation is performed according to the
5551 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5552 *----------------------------------------------------------------------------*/
5553 
5554 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
5555 {
5556     bool aSign, bSign, zSign;
5557     int32_t aExp, bExp, zExp;
5558     uint64_t aSig, bSig, zSig0, zSig1;
5559 
5560     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5561         float_raise(float_flag_invalid, status);
5562         return floatx80_default_nan(status);
5563     }
5564     aSig = extractFloatx80Frac( a );
5565     aExp = extractFloatx80Exp( a );
5566     aSign = extractFloatx80Sign( a );
5567     bSig = extractFloatx80Frac( b );
5568     bExp = extractFloatx80Exp( b );
5569     bSign = extractFloatx80Sign( b );
5570     zSign = aSign ^ bSign;
5571     if ( aExp == 0x7FFF ) {
5572         if (    (uint64_t) ( aSig<<1 )
5573              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5574             return propagateFloatx80NaN(a, b, status);
5575         }
5576         if ( ( bExp | bSig ) == 0 ) goto invalid;
5577         return packFloatx80(zSign, floatx80_infinity_high,
5578                                    floatx80_infinity_low);
5579     }
5580     if ( bExp == 0x7FFF ) {
5581         if ((uint64_t)(bSig << 1)) {
5582             return propagateFloatx80NaN(a, b, status);
5583         }
5584         if ( ( aExp | aSig ) == 0 ) {
5585  invalid:
5586             float_raise(float_flag_invalid, status);
5587             return floatx80_default_nan(status);
5588         }
5589         return packFloatx80(zSign, floatx80_infinity_high,
5590                                    floatx80_infinity_low);
5591     }
5592     if ( aExp == 0 ) {
5593         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5594         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5595     }
5596     if ( bExp == 0 ) {
5597         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5598         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5599     }
5600     zExp = aExp + bExp - 0x3FFE;
5601     mul64To128( aSig, bSig, &zSig0, &zSig1 );
5602     if ( 0 < (int64_t) zSig0 ) {
5603         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5604         --zExp;
5605     }
5606     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5607                                 zSign, zExp, zSig0, zSig1, status);
5608 }
5609 
5610 /*----------------------------------------------------------------------------
5611 | Returns the result of dividing the extended double-precision floating-point
5612 | value `a' by the corresponding value `b'.  The operation is performed
5613 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5614 *----------------------------------------------------------------------------*/
5615 
5616 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
5617 {
5618     bool aSign, bSign, zSign;
5619     int32_t aExp, bExp, zExp;
5620     uint64_t aSig, bSig, zSig0, zSig1;
5621     uint64_t rem0, rem1, rem2, term0, term1, term2;
5622 
5623     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5624         float_raise(float_flag_invalid, status);
5625         return floatx80_default_nan(status);
5626     }
5627     aSig = extractFloatx80Frac( a );
5628     aExp = extractFloatx80Exp( a );
5629     aSign = extractFloatx80Sign( a );
5630     bSig = extractFloatx80Frac( b );
5631     bExp = extractFloatx80Exp( b );
5632     bSign = extractFloatx80Sign( b );
5633     zSign = aSign ^ bSign;
5634     if ( aExp == 0x7FFF ) {
5635         if ((uint64_t)(aSig << 1)) {
5636             return propagateFloatx80NaN(a, b, status);
5637         }
5638         if ( bExp == 0x7FFF ) {
5639             if ((uint64_t)(bSig << 1)) {
5640                 return propagateFloatx80NaN(a, b, status);
5641             }
5642             goto invalid;
5643         }
5644         return packFloatx80(zSign, floatx80_infinity_high,
5645                                    floatx80_infinity_low);
5646     }
5647     if ( bExp == 0x7FFF ) {
5648         if ((uint64_t)(bSig << 1)) {
5649             return propagateFloatx80NaN(a, b, status);
5650         }
5651         return packFloatx80( zSign, 0, 0 );
5652     }
5653     if ( bExp == 0 ) {
5654         if ( bSig == 0 ) {
5655             if ( ( aExp | aSig ) == 0 ) {
5656  invalid:
5657                 float_raise(float_flag_invalid, status);
5658                 return floatx80_default_nan(status);
5659             }
5660             float_raise(float_flag_divbyzero, status);
5661             return packFloatx80(zSign, floatx80_infinity_high,
5662                                        floatx80_infinity_low);
5663         }
5664         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5665     }
5666     if ( aExp == 0 ) {
5667         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5668         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5669     }
5670     zExp = aExp - bExp + 0x3FFE;
5671     rem1 = 0;
5672     if ( bSig <= aSig ) {
5673         shift128Right( aSig, 0, 1, &aSig, &rem1 );
5674         ++zExp;
5675     }
5676     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5677     mul64To128( bSig, zSig0, &term0, &term1 );
5678     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
5679     while ( (int64_t) rem0 < 0 ) {
5680         --zSig0;
5681         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5682     }
5683     zSig1 = estimateDiv128To64( rem1, 0, bSig );
5684     if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
5685         mul64To128( bSig, zSig1, &term1, &term2 );
5686         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5687         while ( (int64_t) rem1 < 0 ) {
5688             --zSig1;
5689             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5690         }
5691         zSig1 |= ( ( rem1 | rem2 ) != 0 );
5692     }
5693     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5694                                 zSign, zExp, zSig0, zSig1, status);
5695 }
5696 
5697 /*----------------------------------------------------------------------------
5698 | Returns the remainder of the extended double-precision floating-point value
5699 | `a' with respect to the corresponding value `b'.  The operation is performed
5700 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5701 *----------------------------------------------------------------------------*/
5702 
5703 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
5704 {
5705     bool aSign, zSign;
5706     int32_t aExp, bExp, expDiff;
5707     uint64_t aSig0, aSig1, bSig;
5708     uint64_t q, term0, term1, alternateASig0, alternateASig1;
5709 
5710     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5711         float_raise(float_flag_invalid, status);
5712         return floatx80_default_nan(status);
5713     }
5714     aSig0 = extractFloatx80Frac( a );
5715     aExp = extractFloatx80Exp( a );
5716     aSign = extractFloatx80Sign( a );
5717     bSig = extractFloatx80Frac( b );
5718     bExp = extractFloatx80Exp( b );
5719     if ( aExp == 0x7FFF ) {
5720         if (    (uint64_t) ( aSig0<<1 )
5721              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5722             return propagateFloatx80NaN(a, b, status);
5723         }
5724         goto invalid;
5725     }
5726     if ( bExp == 0x7FFF ) {
5727         if ((uint64_t)(bSig << 1)) {
5728             return propagateFloatx80NaN(a, b, status);
5729         }
5730         return a;
5731     }
5732     if ( bExp == 0 ) {
5733         if ( bSig == 0 ) {
5734  invalid:
5735             float_raise(float_flag_invalid, status);
5736             return floatx80_default_nan(status);
5737         }
5738         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5739     }
5740     if ( aExp == 0 ) {
5741         if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
5742         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5743     }
5744     bSig |= UINT64_C(0x8000000000000000);
5745     zSign = aSign;
5746     expDiff = aExp - bExp;
5747     aSig1 = 0;
5748     if ( expDiff < 0 ) {
5749         if ( expDiff < -1 ) return a;
5750         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5751         expDiff = 0;
5752     }
5753     q = ( bSig <= aSig0 );
5754     if ( q ) aSig0 -= bSig;
5755     expDiff -= 64;
5756     while ( 0 < expDiff ) {
5757         q = estimateDiv128To64( aSig0, aSig1, bSig );
5758         q = ( 2 < q ) ? q - 2 : 0;
5759         mul64To128( bSig, q, &term0, &term1 );
5760         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5761         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5762         expDiff -= 62;
5763     }
5764     expDiff += 64;
5765     if ( 0 < expDiff ) {
5766         q = estimateDiv128To64( aSig0, aSig1, bSig );
5767         q = ( 2 < q ) ? q - 2 : 0;
5768         q >>= 64 - expDiff;
5769         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5770         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5771         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5772         while ( le128( term0, term1, aSig0, aSig1 ) ) {
5773             ++q;
5774             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5775         }
5776     }
5777     else {
5778         term1 = 0;
5779         term0 = bSig;
5780     }
5781     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5782     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5783          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5784               && ( q & 1 ) )
5785        ) {
5786         aSig0 = alternateASig0;
5787         aSig1 = alternateASig1;
5788         zSign = ! zSign;
5789     }
5790     return
5791         normalizeRoundAndPackFloatx80(
5792             80, zSign, bExp + expDiff, aSig0, aSig1, status);
5793 
5794 }
5795 
5796 /*----------------------------------------------------------------------------
5797 | Returns the square root of the extended double-precision floating-point
5798 | value `a'.  The operation is performed according to the IEC/IEEE Standard
5799 | for Binary Floating-Point Arithmetic.
5800 *----------------------------------------------------------------------------*/
5801 
5802 floatx80 floatx80_sqrt(floatx80 a, float_status *status)
5803 {
5804     bool aSign;
5805     int32_t aExp, zExp;
5806     uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5807     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5808 
5809     if (floatx80_invalid_encoding(a)) {
5810         float_raise(float_flag_invalid, status);
5811         return floatx80_default_nan(status);
5812     }
5813     aSig0 = extractFloatx80Frac( a );
5814     aExp = extractFloatx80Exp( a );
5815     aSign = extractFloatx80Sign( a );
5816     if ( aExp == 0x7FFF ) {
5817         if ((uint64_t)(aSig0 << 1)) {
5818             return propagateFloatx80NaN(a, a, status);
5819         }
5820         if ( ! aSign ) return a;
5821         goto invalid;
5822     }
5823     if ( aSign ) {
5824         if ( ( aExp | aSig0 ) == 0 ) return a;
5825  invalid:
5826         float_raise(float_flag_invalid, status);
5827         return floatx80_default_nan(status);
5828     }
5829     if ( aExp == 0 ) {
5830         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5831         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5832     }
5833     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5834     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5835     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5836     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5837     doubleZSig0 = zSig0<<1;
5838     mul64To128( zSig0, zSig0, &term0, &term1 );
5839     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5840     while ( (int64_t) rem0 < 0 ) {
5841         --zSig0;
5842         doubleZSig0 -= 2;
5843         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5844     }
5845     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5846     if ( ( zSig1 & UINT64_C(0x3FFFFFFFFFFFFFFF) ) <= 5 ) {
5847         if ( zSig1 == 0 ) zSig1 = 1;
5848         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5849         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5850         mul64To128( zSig1, zSig1, &term2, &term3 );
5851         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5852         while ( (int64_t) rem1 < 0 ) {
5853             --zSig1;
5854             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5855             term3 |= 1;
5856             term2 |= doubleZSig0;
5857             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5858         }
5859         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5860     }
5861     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5862     zSig0 |= doubleZSig0;
5863     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5864                                 0, zExp, zSig0, zSig1, status);
5865 }
5866 
5867 /*----------------------------------------------------------------------------
5868 | Returns the result of converting the quadruple-precision floating-point
5869 | value `a' to the 32-bit two's complement integer format.  The conversion
5870 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5871 | Arithmetic---which means in particular that the conversion is rounded
5872 | according to the current rounding mode.  If `a' is a NaN, the largest
5873 | positive integer is returned.  Otherwise, if the conversion overflows, the
5874 | largest integer with the same sign as `a' is returned.
5875 *----------------------------------------------------------------------------*/
5876 
5877 int32_t float128_to_int32(float128 a, float_status *status)
5878 {
5879     bool aSign;
5880     int32_t aExp, shiftCount;
5881     uint64_t aSig0, aSig1;
5882 
5883     aSig1 = extractFloat128Frac1( a );
5884     aSig0 = extractFloat128Frac0( a );
5885     aExp = extractFloat128Exp( a );
5886     aSign = extractFloat128Sign( a );
5887     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5888     if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
5889     aSig0 |= ( aSig1 != 0 );
5890     shiftCount = 0x4028 - aExp;
5891     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5892     return roundAndPackInt32(aSign, aSig0, status);
5893 
5894 }
5895 
5896 /*----------------------------------------------------------------------------
5897 | Returns the result of converting the quadruple-precision floating-point
5898 | value `a' to the 32-bit two's complement integer format.  The conversion
5899 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5900 | Arithmetic, except that the conversion is always rounded toward zero.  If
5901 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
5902 | conversion overflows, the largest integer with the same sign as `a' is
5903 | returned.
5904 *----------------------------------------------------------------------------*/
5905 
5906 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
5907 {
5908     bool aSign;
5909     int32_t aExp, shiftCount;
5910     uint64_t aSig0, aSig1, savedASig;
5911     int32_t z;
5912 
5913     aSig1 = extractFloat128Frac1( a );
5914     aSig0 = extractFloat128Frac0( a );
5915     aExp = extractFloat128Exp( a );
5916     aSign = extractFloat128Sign( a );
5917     aSig0 |= ( aSig1 != 0 );
5918     if ( 0x401E < aExp ) {
5919         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5920         goto invalid;
5921     }
5922     else if ( aExp < 0x3FFF ) {
5923         if (aExp || aSig0) {
5924             status->float_exception_flags |= float_flag_inexact;
5925         }
5926         return 0;
5927     }
5928     aSig0 |= UINT64_C(0x0001000000000000);
5929     shiftCount = 0x402F - aExp;
5930     savedASig = aSig0;
5931     aSig0 >>= shiftCount;
5932     z = aSig0;
5933     if ( aSign ) z = - z;
5934     if ( ( z < 0 ) ^ aSign ) {
5935  invalid:
5936         float_raise(float_flag_invalid, status);
5937         return aSign ? INT32_MIN : INT32_MAX;
5938     }
5939     if ( ( aSig0<<shiftCount ) != savedASig ) {
5940         status->float_exception_flags |= float_flag_inexact;
5941     }
5942     return z;
5943 
5944 }
5945 
5946 /*----------------------------------------------------------------------------
5947 | Returns the result of converting the quadruple-precision floating-point
5948 | value `a' to the 64-bit two's complement integer format.  The conversion
5949 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5950 | Arithmetic---which means in particular that the conversion is rounded
5951 | according to the current rounding mode.  If `a' is a NaN, the largest
5952 | positive integer is returned.  Otherwise, if the conversion overflows, the
5953 | largest integer with the same sign as `a' is returned.
5954 *----------------------------------------------------------------------------*/
5955 
5956 int64_t float128_to_int64(float128 a, float_status *status)
5957 {
5958     bool aSign;
5959     int32_t aExp, shiftCount;
5960     uint64_t aSig0, aSig1;
5961 
5962     aSig1 = extractFloat128Frac1( a );
5963     aSig0 = extractFloat128Frac0( a );
5964     aExp = extractFloat128Exp( a );
5965     aSign = extractFloat128Sign( a );
5966     if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
5967     shiftCount = 0x402F - aExp;
5968     if ( shiftCount <= 0 ) {
5969         if ( 0x403E < aExp ) {
5970             float_raise(float_flag_invalid, status);
5971             if (    ! aSign
5972                  || (    ( aExp == 0x7FFF )
5973                       && ( aSig1 || ( aSig0 != UINT64_C(0x0001000000000000) ) )
5974                     )
5975                ) {
5976                 return INT64_MAX;
5977             }
5978             return INT64_MIN;
5979         }
5980         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5981     }
5982     else {
5983         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5984     }
5985     return roundAndPackInt64(aSign, aSig0, aSig1, status);
5986 
5987 }
5988 
5989 /*----------------------------------------------------------------------------
5990 | Returns the result of converting the quadruple-precision floating-point
5991 | value `a' to the 64-bit two's complement integer format.  The conversion
5992 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5993 | Arithmetic, except that the conversion is always rounded toward zero.
5994 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
5995 | the conversion overflows, the largest integer with the same sign as `a' is
5996 | returned.
5997 *----------------------------------------------------------------------------*/
5998 
5999 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
6000 {
6001     bool aSign;
6002     int32_t aExp, shiftCount;
6003     uint64_t aSig0, aSig1;
6004     int64_t z;
6005 
6006     aSig1 = extractFloat128Frac1( a );
6007     aSig0 = extractFloat128Frac0( a );
6008     aExp = extractFloat128Exp( a );
6009     aSign = extractFloat128Sign( a );
6010     if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
6011     shiftCount = aExp - 0x402F;
6012     if ( 0 < shiftCount ) {
6013         if ( 0x403E <= aExp ) {
6014             aSig0 &= UINT64_C(0x0000FFFFFFFFFFFF);
6015             if (    ( a.high == UINT64_C(0xC03E000000000000) )
6016                  && ( aSig1 < UINT64_C(0x0002000000000000) ) ) {
6017                 if (aSig1) {
6018                     status->float_exception_flags |= float_flag_inexact;
6019                 }
6020             }
6021             else {
6022                 float_raise(float_flag_invalid, status);
6023                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
6024                     return INT64_MAX;
6025                 }
6026             }
6027             return INT64_MIN;
6028         }
6029         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
6030         if ( (uint64_t) ( aSig1<<shiftCount ) ) {
6031             status->float_exception_flags |= float_flag_inexact;
6032         }
6033     }
6034     else {
6035         if ( aExp < 0x3FFF ) {
6036             if ( aExp | aSig0 | aSig1 ) {
6037                 status->float_exception_flags |= float_flag_inexact;
6038             }
6039             return 0;
6040         }
6041         z = aSig0>>( - shiftCount );
6042         if (    aSig1
6043              || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
6044             status->float_exception_flags |= float_flag_inexact;
6045         }
6046     }
6047     if ( aSign ) z = - z;
6048     return z;
6049 
6050 }
6051 
6052 /*----------------------------------------------------------------------------
6053 | Returns the result of converting the quadruple-precision floating-point value
6054 | `a' to the 64-bit unsigned integer format.  The conversion is
6055 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6056 | Arithmetic---which means in particular that the conversion is rounded
6057 | according to the current rounding mode.  If `a' is a NaN, the largest
6058 | positive integer is returned.  If the conversion overflows, the
6059 | largest unsigned integer is returned.  If 'a' is negative, the value is
6060 | rounded and zero is returned; negative values that do not round to zero
6061 | will raise the inexact exception.
6062 *----------------------------------------------------------------------------*/
6063 
6064 uint64_t float128_to_uint64(float128 a, float_status *status)
6065 {
6066     bool aSign;
6067     int aExp;
6068     int shiftCount;
6069     uint64_t aSig0, aSig1;
6070 
6071     aSig0 = extractFloat128Frac0(a);
6072     aSig1 = extractFloat128Frac1(a);
6073     aExp = extractFloat128Exp(a);
6074     aSign = extractFloat128Sign(a);
6075     if (aSign && (aExp > 0x3FFE)) {
6076         float_raise(float_flag_invalid, status);
6077         if (float128_is_any_nan(a)) {
6078             return UINT64_MAX;
6079         } else {
6080             return 0;
6081         }
6082     }
6083     if (aExp) {
6084         aSig0 |= UINT64_C(0x0001000000000000);
6085     }
6086     shiftCount = 0x402F - aExp;
6087     if (shiftCount <= 0) {
6088         if (0x403E < aExp) {
6089             float_raise(float_flag_invalid, status);
6090             return UINT64_MAX;
6091         }
6092         shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
6093     } else {
6094         shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
6095     }
6096     return roundAndPackUint64(aSign, aSig0, aSig1, status);
6097 }
6098 
6099 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
6100 {
6101     uint64_t v;
6102     signed char current_rounding_mode = status->float_rounding_mode;
6103 
6104     set_float_rounding_mode(float_round_to_zero, status);
6105     v = float128_to_uint64(a, status);
6106     set_float_rounding_mode(current_rounding_mode, status);
6107 
6108     return v;
6109 }
6110 
6111 /*----------------------------------------------------------------------------
6112 | Returns the result of converting the quadruple-precision floating-point
6113 | value `a' to the 32-bit unsigned integer format.  The conversion
6114 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6115 | Arithmetic except that the conversion is always rounded toward zero.
6116 | If `a' is a NaN, the largest positive integer is returned.  Otherwise,
6117 | if the conversion overflows, the largest unsigned integer is returned.
6118 | If 'a' is negative, the value is rounded and zero is returned; negative
6119 | values that do not round to zero will raise the inexact exception.
6120 *----------------------------------------------------------------------------*/
6121 
6122 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
6123 {
6124     uint64_t v;
6125     uint32_t res;
6126     int old_exc_flags = get_float_exception_flags(status);
6127 
6128     v = float128_to_uint64_round_to_zero(a, status);
6129     if (v > 0xffffffff) {
6130         res = 0xffffffff;
6131     } else {
6132         return v;
6133     }
6134     set_float_exception_flags(old_exc_flags, status);
6135     float_raise(float_flag_invalid, status);
6136     return res;
6137 }
6138 
6139 /*----------------------------------------------------------------------------
6140 | Returns the result of converting the quadruple-precision floating-point value
6141 | `a' to the 32-bit unsigned integer format.  The conversion is
6142 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6143 | Arithmetic---which means in particular that the conversion is rounded
6144 | according to the current rounding mode.  If `a' is a NaN, the largest
6145 | positive integer is returned.  If the conversion overflows, the
6146 | largest unsigned integer is returned.  If 'a' is negative, the value is
6147 | rounded and zero is returned; negative values that do not round to zero
6148 | will raise the inexact exception.
6149 *----------------------------------------------------------------------------*/
6150 
6151 uint32_t float128_to_uint32(float128 a, float_status *status)
6152 {
6153     uint64_t v;
6154     uint32_t res;
6155     int old_exc_flags = get_float_exception_flags(status);
6156 
6157     v = float128_to_uint64(a, status);
6158     if (v > 0xffffffff) {
6159         res = 0xffffffff;
6160     } else {
6161         return v;
6162     }
6163     set_float_exception_flags(old_exc_flags, status);
6164     float_raise(float_flag_invalid, status);
6165     return res;
6166 }
6167 
6168 /*----------------------------------------------------------------------------
6169 | Returns the result of converting the quadruple-precision floating-point
6170 | value `a' to the single-precision floating-point format.  The conversion
6171 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6172 | Arithmetic.
6173 *----------------------------------------------------------------------------*/
6174 
6175 float32 float128_to_float32(float128 a, float_status *status)
6176 {
6177     bool aSign;
6178     int32_t aExp;
6179     uint64_t aSig0, aSig1;
6180     uint32_t zSig;
6181 
6182     aSig1 = extractFloat128Frac1( a );
6183     aSig0 = extractFloat128Frac0( a );
6184     aExp = extractFloat128Exp( a );
6185     aSign = extractFloat128Sign( a );
6186     if ( aExp == 0x7FFF ) {
6187         if ( aSig0 | aSig1 ) {
6188             return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
6189         }
6190         return packFloat32( aSign, 0xFF, 0 );
6191     }
6192     aSig0 |= ( aSig1 != 0 );
6193     shift64RightJamming( aSig0, 18, &aSig0 );
6194     zSig = aSig0;
6195     if ( aExp || zSig ) {
6196         zSig |= 0x40000000;
6197         aExp -= 0x3F81;
6198     }
6199     return roundAndPackFloat32(aSign, aExp, zSig, status);
6200 
6201 }
6202 
6203 /*----------------------------------------------------------------------------
6204 | Returns the result of converting the quadruple-precision floating-point
6205 | value `a' to the double-precision floating-point format.  The conversion
6206 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6207 | Arithmetic.
6208 *----------------------------------------------------------------------------*/
6209 
6210 float64 float128_to_float64(float128 a, float_status *status)
6211 {
6212     bool aSign;
6213     int32_t aExp;
6214     uint64_t aSig0, aSig1;
6215 
6216     aSig1 = extractFloat128Frac1( a );
6217     aSig0 = extractFloat128Frac0( a );
6218     aExp = extractFloat128Exp( a );
6219     aSign = extractFloat128Sign( a );
6220     if ( aExp == 0x7FFF ) {
6221         if ( aSig0 | aSig1 ) {
6222             return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
6223         }
6224         return packFloat64( aSign, 0x7FF, 0 );
6225     }
6226     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6227     aSig0 |= ( aSig1 != 0 );
6228     if ( aExp || aSig0 ) {
6229         aSig0 |= UINT64_C(0x4000000000000000);
6230         aExp -= 0x3C01;
6231     }
6232     return roundAndPackFloat64(aSign, aExp, aSig0, status);
6233 
6234 }
6235 
6236 /*----------------------------------------------------------------------------
6237 | Returns the result of converting the quadruple-precision floating-point
6238 | value `a' to the extended double-precision floating-point format.  The
6239 | conversion is performed according to the IEC/IEEE Standard for Binary
6240 | Floating-Point Arithmetic.
6241 *----------------------------------------------------------------------------*/
6242 
6243 floatx80 float128_to_floatx80(float128 a, float_status *status)
6244 {
6245     bool aSign;
6246     int32_t aExp;
6247     uint64_t aSig0, aSig1;
6248 
6249     aSig1 = extractFloat128Frac1( a );
6250     aSig0 = extractFloat128Frac0( a );
6251     aExp = extractFloat128Exp( a );
6252     aSign = extractFloat128Sign( a );
6253     if ( aExp == 0x7FFF ) {
6254         if ( aSig0 | aSig1 ) {
6255             floatx80 res = commonNaNToFloatx80(float128ToCommonNaN(a, status),
6256                                                status);
6257             return floatx80_silence_nan(res, status);
6258         }
6259         return packFloatx80(aSign, floatx80_infinity_high,
6260                                    floatx80_infinity_low);
6261     }
6262     if ( aExp == 0 ) {
6263         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
6264         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6265     }
6266     else {
6267         aSig0 |= UINT64_C(0x0001000000000000);
6268     }
6269     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
6270     return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
6271 
6272 }
6273 
6274 /*----------------------------------------------------------------------------
6275 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6276 | returns the result as a quadruple-precision floating-point value.  The
6277 | operation is performed according to the IEC/IEEE Standard for Binary
6278 | Floating-Point Arithmetic.
6279 *----------------------------------------------------------------------------*/
6280 
6281 float128 float128_round_to_int(float128 a, float_status *status)
6282 {
6283     bool aSign;
6284     int32_t aExp;
6285     uint64_t lastBitMask, roundBitsMask;
6286     float128 z;
6287 
6288     aExp = extractFloat128Exp( a );
6289     if ( 0x402F <= aExp ) {
6290         if ( 0x406F <= aExp ) {
6291             if (    ( aExp == 0x7FFF )
6292                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
6293                ) {
6294                 return propagateFloat128NaN(a, a, status);
6295             }
6296             return a;
6297         }
6298         lastBitMask = 1;
6299         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
6300         roundBitsMask = lastBitMask - 1;
6301         z = a;
6302         switch (status->float_rounding_mode) {
6303         case float_round_nearest_even:
6304             if ( lastBitMask ) {
6305                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
6306                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
6307             }
6308             else {
6309                 if ( (int64_t) z.low < 0 ) {
6310                     ++z.high;
6311                     if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
6312                 }
6313             }
6314             break;
6315         case float_round_ties_away:
6316             if (lastBitMask) {
6317                 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
6318             } else {
6319                 if ((int64_t) z.low < 0) {
6320                     ++z.high;
6321                 }
6322             }
6323             break;
6324         case float_round_to_zero:
6325             break;
6326         case float_round_up:
6327             if (!extractFloat128Sign(z)) {
6328                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6329             }
6330             break;
6331         case float_round_down:
6332             if (extractFloat128Sign(z)) {
6333                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6334             }
6335             break;
6336         case float_round_to_odd:
6337             /*
6338              * Note that if lastBitMask == 0, the last bit is the lsb
6339              * of high, and roundBitsMask == -1.
6340              */
6341             if ((lastBitMask ? z.low & lastBitMask : z.high & 1) == 0) {
6342                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6343             }
6344             break;
6345         default:
6346             abort();
6347         }
6348         z.low &= ~ roundBitsMask;
6349     }
6350     else {
6351         if ( aExp < 0x3FFF ) {
6352             if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
6353             status->float_exception_flags |= float_flag_inexact;
6354             aSign = extractFloat128Sign( a );
6355             switch (status->float_rounding_mode) {
6356             case float_round_nearest_even:
6357                 if (    ( aExp == 0x3FFE )
6358                      && (   extractFloat128Frac0( a )
6359                           | extractFloat128Frac1( a ) )
6360                    ) {
6361                     return packFloat128( aSign, 0x3FFF, 0, 0 );
6362                 }
6363                 break;
6364             case float_round_ties_away:
6365                 if (aExp == 0x3FFE) {
6366                     return packFloat128(aSign, 0x3FFF, 0, 0);
6367                 }
6368                 break;
6369             case float_round_down:
6370                 return
6371                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
6372                     : packFloat128( 0, 0, 0, 0 );
6373             case float_round_up:
6374                 return
6375                       aSign ? packFloat128( 1, 0, 0, 0 )
6376                     : packFloat128( 0, 0x3FFF, 0, 0 );
6377 
6378             case float_round_to_odd:
6379                 return packFloat128(aSign, 0x3FFF, 0, 0);
6380 
6381             case float_round_to_zero:
6382                 break;
6383             }
6384             return packFloat128( aSign, 0, 0, 0 );
6385         }
6386         lastBitMask = 1;
6387         lastBitMask <<= 0x402F - aExp;
6388         roundBitsMask = lastBitMask - 1;
6389         z.low = 0;
6390         z.high = a.high;
6391         switch (status->float_rounding_mode) {
6392         case float_round_nearest_even:
6393             z.high += lastBitMask>>1;
6394             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
6395                 z.high &= ~ lastBitMask;
6396             }
6397             break;
6398         case float_round_ties_away:
6399             z.high += lastBitMask>>1;
6400             break;
6401         case float_round_to_zero:
6402             break;
6403         case float_round_up:
6404             if (!extractFloat128Sign(z)) {
6405                 z.high |= ( a.low != 0 );
6406                 z.high += roundBitsMask;
6407             }
6408             break;
6409         case float_round_down:
6410             if (extractFloat128Sign(z)) {
6411                 z.high |= (a.low != 0);
6412                 z.high += roundBitsMask;
6413             }
6414             break;
6415         case float_round_to_odd:
6416             if ((z.high & lastBitMask) == 0) {
6417                 z.high |= (a.low != 0);
6418                 z.high += roundBitsMask;
6419             }
6420             break;
6421         default:
6422             abort();
6423         }
6424         z.high &= ~ roundBitsMask;
6425     }
6426     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
6427         status->float_exception_flags |= float_flag_inexact;
6428     }
6429     return z;
6430 
6431 }
6432 
6433 /*----------------------------------------------------------------------------
6434 | Returns the result of adding the absolute values of the quadruple-precision
6435 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
6436 | before being returned.  `zSign' is ignored if the result is a NaN.
6437 | The addition is performed according to the IEC/IEEE Standard for Binary
6438 | Floating-Point Arithmetic.
6439 *----------------------------------------------------------------------------*/
6440 
6441 static float128 addFloat128Sigs(float128 a, float128 b, bool zSign,
6442                                 float_status *status)
6443 {
6444     int32_t aExp, bExp, zExp;
6445     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6446     int32_t expDiff;
6447 
6448     aSig1 = extractFloat128Frac1( a );
6449     aSig0 = extractFloat128Frac0( a );
6450     aExp = extractFloat128Exp( a );
6451     bSig1 = extractFloat128Frac1( b );
6452     bSig0 = extractFloat128Frac0( b );
6453     bExp = extractFloat128Exp( b );
6454     expDiff = aExp - bExp;
6455     if ( 0 < expDiff ) {
6456         if ( aExp == 0x7FFF ) {
6457             if (aSig0 | aSig1) {
6458                 return propagateFloat128NaN(a, b, status);
6459             }
6460             return a;
6461         }
6462         if ( bExp == 0 ) {
6463             --expDiff;
6464         }
6465         else {
6466             bSig0 |= UINT64_C(0x0001000000000000);
6467         }
6468         shift128ExtraRightJamming(
6469             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
6470         zExp = aExp;
6471     }
6472     else if ( expDiff < 0 ) {
6473         if ( bExp == 0x7FFF ) {
6474             if (bSig0 | bSig1) {
6475                 return propagateFloat128NaN(a, b, status);
6476             }
6477             return packFloat128( zSign, 0x7FFF, 0, 0 );
6478         }
6479         if ( aExp == 0 ) {
6480             ++expDiff;
6481         }
6482         else {
6483             aSig0 |= UINT64_C(0x0001000000000000);
6484         }
6485         shift128ExtraRightJamming(
6486             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
6487         zExp = bExp;
6488     }
6489     else {
6490         if ( aExp == 0x7FFF ) {
6491             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6492                 return propagateFloat128NaN(a, b, status);
6493             }
6494             return a;
6495         }
6496         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6497         if ( aExp == 0 ) {
6498             if (status->flush_to_zero) {
6499                 if (zSig0 | zSig1) {
6500                     float_raise(float_flag_output_denormal, status);
6501                 }
6502                 return packFloat128(zSign, 0, 0, 0);
6503             }
6504             return packFloat128( zSign, 0, zSig0, zSig1 );
6505         }
6506         zSig2 = 0;
6507         zSig0 |= UINT64_C(0x0002000000000000);
6508         zExp = aExp;
6509         goto shiftRight1;
6510     }
6511     aSig0 |= UINT64_C(0x0001000000000000);
6512     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6513     --zExp;
6514     if ( zSig0 < UINT64_C(0x0002000000000000) ) goto roundAndPack;
6515     ++zExp;
6516  shiftRight1:
6517     shift128ExtraRightJamming(
6518         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6519  roundAndPack:
6520     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6521 
6522 }
6523 
6524 /*----------------------------------------------------------------------------
6525 | Returns the result of subtracting the absolute values of the quadruple-
6526 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
6527 | difference is negated before being returned.  `zSign' is ignored if the
6528 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
6529 | Standard for Binary Floating-Point Arithmetic.
6530 *----------------------------------------------------------------------------*/
6531 
6532 static float128 subFloat128Sigs(float128 a, float128 b, bool zSign,
6533                                 float_status *status)
6534 {
6535     int32_t aExp, bExp, zExp;
6536     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
6537     int32_t expDiff;
6538 
6539     aSig1 = extractFloat128Frac1( a );
6540     aSig0 = extractFloat128Frac0( a );
6541     aExp = extractFloat128Exp( a );
6542     bSig1 = extractFloat128Frac1( b );
6543     bSig0 = extractFloat128Frac0( b );
6544     bExp = extractFloat128Exp( b );
6545     expDiff = aExp - bExp;
6546     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6547     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6548     if ( 0 < expDiff ) goto aExpBigger;
6549     if ( expDiff < 0 ) goto bExpBigger;
6550     if ( aExp == 0x7FFF ) {
6551         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6552             return propagateFloat128NaN(a, b, status);
6553         }
6554         float_raise(float_flag_invalid, status);
6555         return float128_default_nan(status);
6556     }
6557     if ( aExp == 0 ) {
6558         aExp = 1;
6559         bExp = 1;
6560     }
6561     if ( bSig0 < aSig0 ) goto aBigger;
6562     if ( aSig0 < bSig0 ) goto bBigger;
6563     if ( bSig1 < aSig1 ) goto aBigger;
6564     if ( aSig1 < bSig1 ) goto bBigger;
6565     return packFloat128(status->float_rounding_mode == float_round_down,
6566                         0, 0, 0);
6567  bExpBigger:
6568     if ( bExp == 0x7FFF ) {
6569         if (bSig0 | bSig1) {
6570             return propagateFloat128NaN(a, b, status);
6571         }
6572         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6573     }
6574     if ( aExp == 0 ) {
6575         ++expDiff;
6576     }
6577     else {
6578         aSig0 |= UINT64_C(0x4000000000000000);
6579     }
6580     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6581     bSig0 |= UINT64_C(0x4000000000000000);
6582  bBigger:
6583     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6584     zExp = bExp;
6585     zSign ^= 1;
6586     goto normalizeRoundAndPack;
6587  aExpBigger:
6588     if ( aExp == 0x7FFF ) {
6589         if (aSig0 | aSig1) {
6590             return propagateFloat128NaN(a, b, status);
6591         }
6592         return a;
6593     }
6594     if ( bExp == 0 ) {
6595         --expDiff;
6596     }
6597     else {
6598         bSig0 |= UINT64_C(0x4000000000000000);
6599     }
6600     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6601     aSig0 |= UINT64_C(0x4000000000000000);
6602  aBigger:
6603     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6604     zExp = aExp;
6605  normalizeRoundAndPack:
6606     --zExp;
6607     return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
6608                                          status);
6609 
6610 }
6611 
6612 /*----------------------------------------------------------------------------
6613 | Returns the result of adding the quadruple-precision floating-point values
6614 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
6615 | for Binary Floating-Point Arithmetic.
6616 *----------------------------------------------------------------------------*/
6617 
6618 float128 float128_add(float128 a, float128 b, float_status *status)
6619 {
6620     bool aSign, bSign;
6621 
6622     aSign = extractFloat128Sign( a );
6623     bSign = extractFloat128Sign( b );
6624     if ( aSign == bSign ) {
6625         return addFloat128Sigs(a, b, aSign, status);
6626     }
6627     else {
6628         return subFloat128Sigs(a, b, aSign, status);
6629     }
6630 
6631 }
6632 
6633 /*----------------------------------------------------------------------------
6634 | Returns the result of subtracting the quadruple-precision floating-point
6635 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6636 | Standard for Binary Floating-Point Arithmetic.
6637 *----------------------------------------------------------------------------*/
6638 
6639 float128 float128_sub(float128 a, float128 b, float_status *status)
6640 {
6641     bool aSign, bSign;
6642 
6643     aSign = extractFloat128Sign( a );
6644     bSign = extractFloat128Sign( b );
6645     if ( aSign == bSign ) {
6646         return subFloat128Sigs(a, b, aSign, status);
6647     }
6648     else {
6649         return addFloat128Sigs(a, b, aSign, status);
6650     }
6651 
6652 }
6653 
6654 /*----------------------------------------------------------------------------
6655 | Returns the result of multiplying the quadruple-precision floating-point
6656 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6657 | Standard for Binary Floating-Point Arithmetic.
6658 *----------------------------------------------------------------------------*/
6659 
6660 float128 float128_mul(float128 a, float128 b, float_status *status)
6661 {
6662     bool aSign, bSign, zSign;
6663     int32_t aExp, bExp, zExp;
6664     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
6665 
6666     aSig1 = extractFloat128Frac1( a );
6667     aSig0 = extractFloat128Frac0( a );
6668     aExp = extractFloat128Exp( a );
6669     aSign = extractFloat128Sign( a );
6670     bSig1 = extractFloat128Frac1( b );
6671     bSig0 = extractFloat128Frac0( b );
6672     bExp = extractFloat128Exp( b );
6673     bSign = extractFloat128Sign( b );
6674     zSign = aSign ^ bSign;
6675     if ( aExp == 0x7FFF ) {
6676         if (    ( aSig0 | aSig1 )
6677              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6678             return propagateFloat128NaN(a, b, status);
6679         }
6680         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6681         return packFloat128( zSign, 0x7FFF, 0, 0 );
6682     }
6683     if ( bExp == 0x7FFF ) {
6684         if (bSig0 | bSig1) {
6685             return propagateFloat128NaN(a, b, status);
6686         }
6687         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6688  invalid:
6689             float_raise(float_flag_invalid, status);
6690             return float128_default_nan(status);
6691         }
6692         return packFloat128( zSign, 0x7FFF, 0, 0 );
6693     }
6694     if ( aExp == 0 ) {
6695         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6696         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6697     }
6698     if ( bExp == 0 ) {
6699         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6700         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6701     }
6702     zExp = aExp + bExp - 0x4000;
6703     aSig0 |= UINT64_C(0x0001000000000000);
6704     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6705     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6706     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6707     zSig2 |= ( zSig3 != 0 );
6708     if (UINT64_C( 0x0002000000000000) <= zSig0 ) {
6709         shift128ExtraRightJamming(
6710             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6711         ++zExp;
6712     }
6713     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6714 
6715 }
6716 
6717 /*----------------------------------------------------------------------------
6718 | Returns the result of dividing the quadruple-precision floating-point value
6719 | `a' by the corresponding value `b'.  The operation is performed according to
6720 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6721 *----------------------------------------------------------------------------*/
6722 
6723 float128 float128_div(float128 a, float128 b, float_status *status)
6724 {
6725     bool aSign, bSign, zSign;
6726     int32_t aExp, bExp, zExp;
6727     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6728     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6729 
6730     aSig1 = extractFloat128Frac1( a );
6731     aSig0 = extractFloat128Frac0( a );
6732     aExp = extractFloat128Exp( a );
6733     aSign = extractFloat128Sign( a );
6734     bSig1 = extractFloat128Frac1( b );
6735     bSig0 = extractFloat128Frac0( b );
6736     bExp = extractFloat128Exp( b );
6737     bSign = extractFloat128Sign( b );
6738     zSign = aSign ^ bSign;
6739     if ( aExp == 0x7FFF ) {
6740         if (aSig0 | aSig1) {
6741             return propagateFloat128NaN(a, b, status);
6742         }
6743         if ( bExp == 0x7FFF ) {
6744             if (bSig0 | bSig1) {
6745                 return propagateFloat128NaN(a, b, status);
6746             }
6747             goto invalid;
6748         }
6749         return packFloat128( zSign, 0x7FFF, 0, 0 );
6750     }
6751     if ( bExp == 0x7FFF ) {
6752         if (bSig0 | bSig1) {
6753             return propagateFloat128NaN(a, b, status);
6754         }
6755         return packFloat128( zSign, 0, 0, 0 );
6756     }
6757     if ( bExp == 0 ) {
6758         if ( ( bSig0 | bSig1 ) == 0 ) {
6759             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6760  invalid:
6761                 float_raise(float_flag_invalid, status);
6762                 return float128_default_nan(status);
6763             }
6764             float_raise(float_flag_divbyzero, status);
6765             return packFloat128( zSign, 0x7FFF, 0, 0 );
6766         }
6767         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6768     }
6769     if ( aExp == 0 ) {
6770         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6771         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6772     }
6773     zExp = aExp - bExp + 0x3FFD;
6774     shortShift128Left(
6775         aSig0 | UINT64_C(0x0001000000000000), aSig1, 15, &aSig0, &aSig1 );
6776     shortShift128Left(
6777         bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 );
6778     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6779         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6780         ++zExp;
6781     }
6782     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6783     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6784     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
6785     while ( (int64_t) rem0 < 0 ) {
6786         --zSig0;
6787         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6788     }
6789     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6790     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6791         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6792         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
6793         while ( (int64_t) rem1 < 0 ) {
6794             --zSig1;
6795             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6796         }
6797         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6798     }
6799     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
6800     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6801 
6802 }
6803 
6804 /*----------------------------------------------------------------------------
6805 | Returns the remainder of the quadruple-precision floating-point value `a'
6806 | with respect to the corresponding value `b'.  The operation is performed
6807 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6808 *----------------------------------------------------------------------------*/
6809 
6810 float128 float128_rem(float128 a, float128 b, float_status *status)
6811 {
6812     bool aSign, zSign;
6813     int32_t aExp, bExp, expDiff;
6814     uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6815     uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6816     int64_t sigMean0;
6817 
6818     aSig1 = extractFloat128Frac1( a );
6819     aSig0 = extractFloat128Frac0( a );
6820     aExp = extractFloat128Exp( a );
6821     aSign = extractFloat128Sign( a );
6822     bSig1 = extractFloat128Frac1( b );
6823     bSig0 = extractFloat128Frac0( b );
6824     bExp = extractFloat128Exp( b );
6825     if ( aExp == 0x7FFF ) {
6826         if (    ( aSig0 | aSig1 )
6827              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6828             return propagateFloat128NaN(a, b, status);
6829         }
6830         goto invalid;
6831     }
6832     if ( bExp == 0x7FFF ) {
6833         if (bSig0 | bSig1) {
6834             return propagateFloat128NaN(a, b, status);
6835         }
6836         return a;
6837     }
6838     if ( bExp == 0 ) {
6839         if ( ( bSig0 | bSig1 ) == 0 ) {
6840  invalid:
6841             float_raise(float_flag_invalid, status);
6842             return float128_default_nan(status);
6843         }
6844         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6845     }
6846     if ( aExp == 0 ) {
6847         if ( ( aSig0 | aSig1 ) == 0 ) return a;
6848         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6849     }
6850     expDiff = aExp - bExp;
6851     if ( expDiff < -1 ) return a;
6852     shortShift128Left(
6853         aSig0 | UINT64_C(0x0001000000000000),
6854         aSig1,
6855         15 - ( expDiff < 0 ),
6856         &aSig0,
6857         &aSig1
6858     );
6859     shortShift128Left(
6860         bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 );
6861     q = le128( bSig0, bSig1, aSig0, aSig1 );
6862     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6863     expDiff -= 64;
6864     while ( 0 < expDiff ) {
6865         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6866         q = ( 4 < q ) ? q - 4 : 0;
6867         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6868         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6869         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6870         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6871         expDiff -= 61;
6872     }
6873     if ( -64 < expDiff ) {
6874         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6875         q = ( 4 < q ) ? q - 4 : 0;
6876         q >>= - expDiff;
6877         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6878         expDiff += 52;
6879         if ( expDiff < 0 ) {
6880             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6881         }
6882         else {
6883             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6884         }
6885         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6886         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6887     }
6888     else {
6889         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6890         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6891     }
6892     do {
6893         alternateASig0 = aSig0;
6894         alternateASig1 = aSig1;
6895         ++q;
6896         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6897     } while ( 0 <= (int64_t) aSig0 );
6898     add128(
6899         aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
6900     if (    ( sigMean0 < 0 )
6901          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6902         aSig0 = alternateASig0;
6903         aSig1 = alternateASig1;
6904     }
6905     zSign = ( (int64_t) aSig0 < 0 );
6906     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6907     return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
6908                                          status);
6909 }
6910 
6911 /*----------------------------------------------------------------------------
6912 | Returns the square root of the quadruple-precision floating-point value `a'.
6913 | The operation is performed according to the IEC/IEEE Standard for Binary
6914 | Floating-Point Arithmetic.
6915 *----------------------------------------------------------------------------*/
6916 
6917 float128 float128_sqrt(float128 a, float_status *status)
6918 {
6919     bool aSign;
6920     int32_t aExp, zExp;
6921     uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6922     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6923 
6924     aSig1 = extractFloat128Frac1( a );
6925     aSig0 = extractFloat128Frac0( a );
6926     aExp = extractFloat128Exp( a );
6927     aSign = extractFloat128Sign( a );
6928     if ( aExp == 0x7FFF ) {
6929         if (aSig0 | aSig1) {
6930             return propagateFloat128NaN(a, a, status);
6931         }
6932         if ( ! aSign ) return a;
6933         goto invalid;
6934     }
6935     if ( aSign ) {
6936         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6937  invalid:
6938         float_raise(float_flag_invalid, status);
6939         return float128_default_nan(status);
6940     }
6941     if ( aExp == 0 ) {
6942         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6943         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6944     }
6945     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6946     aSig0 |= UINT64_C(0x0001000000000000);
6947     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6948     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6949     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6950     doubleZSig0 = zSig0<<1;
6951     mul64To128( zSig0, zSig0, &term0, &term1 );
6952     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6953     while ( (int64_t) rem0 < 0 ) {
6954         --zSig0;
6955         doubleZSig0 -= 2;
6956         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6957     }
6958     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6959     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6960         if ( zSig1 == 0 ) zSig1 = 1;
6961         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6962         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6963         mul64To128( zSig1, zSig1, &term2, &term3 );
6964         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6965         while ( (int64_t) rem1 < 0 ) {
6966             --zSig1;
6967             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6968             term3 |= 1;
6969             term2 |= doubleZSig0;
6970             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6971         }
6972         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6973     }
6974     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6975     return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
6976 
6977 }
6978 
6979 static inline FloatRelation
6980 floatx80_compare_internal(floatx80 a, floatx80 b, bool is_quiet,
6981                           float_status *status)
6982 {
6983     bool aSign, bSign;
6984 
6985     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6986         float_raise(float_flag_invalid, status);
6987         return float_relation_unordered;
6988     }
6989     if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
6990           ( extractFloatx80Frac( a )<<1 ) ) ||
6991         ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
6992           ( extractFloatx80Frac( b )<<1 ) )) {
6993         if (!is_quiet ||
6994             floatx80_is_signaling_nan(a, status) ||
6995             floatx80_is_signaling_nan(b, status)) {
6996             float_raise(float_flag_invalid, status);
6997         }
6998         return float_relation_unordered;
6999     }
7000     aSign = extractFloatx80Sign( a );
7001     bSign = extractFloatx80Sign( b );
7002     if ( aSign != bSign ) {
7003 
7004         if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
7005              ( ( a.low | b.low ) == 0 ) ) {
7006             /* zero case */
7007             return float_relation_equal;
7008         } else {
7009             return 1 - (2 * aSign);
7010         }
7011     } else {
7012         /* Normalize pseudo-denormals before comparison.  */
7013         if ((a.high & 0x7fff) == 0 && a.low & UINT64_C(0x8000000000000000)) {
7014             ++a.high;
7015         }
7016         if ((b.high & 0x7fff) == 0 && b.low & UINT64_C(0x8000000000000000)) {
7017             ++b.high;
7018         }
7019         if (a.low == b.low && a.high == b.high) {
7020             return float_relation_equal;
7021         } else {
7022             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7023         }
7024     }
7025 }
7026 
7027 FloatRelation floatx80_compare(floatx80 a, floatx80 b, float_status *status)
7028 {
7029     return floatx80_compare_internal(a, b, 0, status);
7030 }
7031 
7032 FloatRelation floatx80_compare_quiet(floatx80 a, floatx80 b,
7033                                      float_status *status)
7034 {
7035     return floatx80_compare_internal(a, b, 1, status);
7036 }
7037 
7038 static inline FloatRelation
7039 float128_compare_internal(float128 a, float128 b, bool is_quiet,
7040                           float_status *status)
7041 {
7042     bool aSign, bSign;
7043 
7044     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7045           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7046         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7047           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7048         if (!is_quiet ||
7049             float128_is_signaling_nan(a, status) ||
7050             float128_is_signaling_nan(b, status)) {
7051             float_raise(float_flag_invalid, status);
7052         }
7053         return float_relation_unordered;
7054     }
7055     aSign = extractFloat128Sign( a );
7056     bSign = extractFloat128Sign( b );
7057     if ( aSign != bSign ) {
7058         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7059             /* zero case */
7060             return float_relation_equal;
7061         } else {
7062             return 1 - (2 * aSign);
7063         }
7064     } else {
7065         if (a.low == b.low && a.high == b.high) {
7066             return float_relation_equal;
7067         } else {
7068             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7069         }
7070     }
7071 }
7072 
7073 FloatRelation float128_compare(float128 a, float128 b, float_status *status)
7074 {
7075     return float128_compare_internal(a, b, 0, status);
7076 }
7077 
7078 FloatRelation float128_compare_quiet(float128 a, float128 b,
7079                                      float_status *status)
7080 {
7081     return float128_compare_internal(a, b, 1, status);
7082 }
7083 
7084 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
7085 {
7086     bool aSign;
7087     int32_t aExp;
7088     uint64_t aSig;
7089 
7090     if (floatx80_invalid_encoding(a)) {
7091         float_raise(float_flag_invalid, status);
7092         return floatx80_default_nan(status);
7093     }
7094     aSig = extractFloatx80Frac( a );
7095     aExp = extractFloatx80Exp( a );
7096     aSign = extractFloatx80Sign( a );
7097 
7098     if ( aExp == 0x7FFF ) {
7099         if ( aSig<<1 ) {
7100             return propagateFloatx80NaN(a, a, status);
7101         }
7102         return a;
7103     }
7104 
7105     if (aExp == 0) {
7106         if (aSig == 0) {
7107             return a;
7108         }
7109         aExp++;
7110     }
7111 
7112     if (n > 0x10000) {
7113         n = 0x10000;
7114     } else if (n < -0x10000) {
7115         n = -0x10000;
7116     }
7117 
7118     aExp += n;
7119     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7120                                          aSign, aExp, aSig, 0, status);
7121 }
7122 
7123 float128 float128_scalbn(float128 a, int n, float_status *status)
7124 {
7125     bool aSign;
7126     int32_t aExp;
7127     uint64_t aSig0, aSig1;
7128 
7129     aSig1 = extractFloat128Frac1( a );
7130     aSig0 = extractFloat128Frac0( a );
7131     aExp = extractFloat128Exp( a );
7132     aSign = extractFloat128Sign( a );
7133     if ( aExp == 0x7FFF ) {
7134         if ( aSig0 | aSig1 ) {
7135             return propagateFloat128NaN(a, a, status);
7136         }
7137         return a;
7138     }
7139     if (aExp != 0) {
7140         aSig0 |= UINT64_C(0x0001000000000000);
7141     } else if (aSig0 == 0 && aSig1 == 0) {
7142         return a;
7143     } else {
7144         aExp++;
7145     }
7146 
7147     if (n > 0x10000) {
7148         n = 0x10000;
7149     } else if (n < -0x10000) {
7150         n = -0x10000;
7151     }
7152 
7153     aExp += n - 1;
7154     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7155                                          , status);
7156 
7157 }
7158 
7159 static void __attribute__((constructor)) softfloat_init(void)
7160 {
7161     union_float64 ua, ub, uc, ur;
7162 
7163     if (QEMU_NO_HARDFLOAT) {
7164         return;
7165     }
7166     /*
7167      * Test that the host's FMA is not obviously broken. For example,
7168      * glibc < 2.23 can perform an incorrect FMA on certain hosts; see
7169      *   https://sourceware.org/bugzilla/show_bug.cgi?id=13304
7170      */
7171     ua.s = 0x0020000000000001ULL;
7172     ub.s = 0x3ca0000000000000ULL;
7173     uc.s = 0x0020000000000000ULL;
7174     ur.h = fma(ua.h, ub.h, uc.h);
7175     if (ur.s != 0x0020000000000001ULL) {
7176         force_soft_fma = true;
7177     }
7178 }
7179