xref: /openbmc/qemu/fpu/softfloat.c (revision b0476d66)
1 /*
2  * QEMU float support
3  *
4  * The code in this source file is derived from release 2a of the SoftFloat
5  * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6  * some later contributions) are provided under that license, as detailed below.
7  * It has subsequently been modified by contributors to the QEMU Project,
8  * so some portions are provided under:
9  *  the SoftFloat-2a license
10  *  the BSD license
11  *  GPL-v2-or-later
12  *
13  * Any future contributions to this file after December 1st 2014 will be
14  * taken to be licensed under the Softfloat-2a license unless specifically
15  * indicated otherwise.
16  */
17 
18 /*
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
22 
23 Written by John R. Hauser.  This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704.  Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980.  The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
32 
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38 
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
43 
44 ===============================================================================
45 */
46 
47 /* BSD licensing:
48  * Copyright (c) 2006, Fabrice Bellard
49  * All rights reserved.
50  *
51  * Redistribution and use in source and binary forms, with or without
52  * modification, are permitted provided that the following conditions are met:
53  *
54  * 1. Redistributions of source code must retain the above copyright notice,
55  * this list of conditions and the following disclaimer.
56  *
57  * 2. Redistributions in binary form must reproduce the above copyright notice,
58  * this list of conditions and the following disclaimer in the documentation
59  * and/or other materials provided with the distribution.
60  *
61  * 3. Neither the name of the copyright holder nor the names of its contributors
62  * may be used to endorse or promote products derived from this software without
63  * specific prior written permission.
64  *
65  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75  * THE POSSIBILITY OF SUCH DAMAGE.
76  */
77 
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79  * version 2 or later. See the COPYING file in the top-level directory.
80  */
81 
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83  * target-dependent and needs the TARGET_* macros.
84  */
85 #include "qemu/osdep.h"
86 #include <math.h>
87 #include "qemu/bitops.h"
88 #include "fpu/softfloat.h"
89 
90 /* We only need stdlib for abort() */
91 
92 /*----------------------------------------------------------------------------
93 | Primitive arithmetic functions, including multi-word arithmetic, and
94 | division and square root approximations.  (Can be specialized to target if
95 | desired.)
96 *----------------------------------------------------------------------------*/
97 #include "fpu/softfloat-macros.h"
98 
99 /*
100  * Hardfloat
101  *
102  * Fast emulation of guest FP instructions is challenging for two reasons.
103  * First, FP instruction semantics are similar but not identical, particularly
104  * when handling NaNs. Second, emulating at reasonable speed the guest FP
105  * exception flags is not trivial: reading the host's flags register with a
106  * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp],
107  * and trapping on every FP exception is not fast nor pleasant to work with.
108  *
109  * We address these challenges by leveraging the host FPU for a subset of the
110  * operations. To do this we expand on the idea presented in this paper:
111  *
112  * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a
113  * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615.
114  *
115  * The idea is thus to leverage the host FPU to (1) compute FP operations
116  * and (2) identify whether FP exceptions occurred while avoiding
117  * expensive exception flag register accesses.
118  *
119  * An important optimization shown in the paper is that given that exception
120  * flags are rarely cleared by the guest, we can avoid recomputing some flags.
121  * This is particularly useful for the inexact flag, which is very frequently
122  * raised in floating-point workloads.
123  *
124  * We optimize the code further by deferring to soft-fp whenever FP exception
125  * detection might get hairy. Two examples: (1) when at least one operand is
126  * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result
127  * and the result is < the minimum normal.
128  */
129 #define GEN_INPUT_FLUSH__NOCHECK(name, soft_t)                          \
130     static inline void name(soft_t *a, float_status *s)                 \
131     {                                                                   \
132         if (unlikely(soft_t ## _is_denormal(*a))) {                     \
133             *a = soft_t ## _set_sign(soft_t ## _zero,                   \
134                                      soft_t ## _is_neg(*a));            \
135             s->float_exception_flags |= float_flag_input_denormal;      \
136         }                                                               \
137     }
138 
139 GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck, float32)
140 GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck, float64)
141 #undef GEN_INPUT_FLUSH__NOCHECK
142 
143 #define GEN_INPUT_FLUSH1(name, soft_t)                  \
144     static inline void name(soft_t *a, float_status *s) \
145     {                                                   \
146         if (likely(!s->flush_inputs_to_zero)) {         \
147             return;                                     \
148         }                                               \
149         soft_t ## _input_flush__nocheck(a, s);          \
150     }
151 
152 GEN_INPUT_FLUSH1(float32_input_flush1, float32)
153 GEN_INPUT_FLUSH1(float64_input_flush1, float64)
154 #undef GEN_INPUT_FLUSH1
155 
156 #define GEN_INPUT_FLUSH2(name, soft_t)                                  \
157     static inline void name(soft_t *a, soft_t *b, float_status *s)      \
158     {                                                                   \
159         if (likely(!s->flush_inputs_to_zero)) {                         \
160             return;                                                     \
161         }                                                               \
162         soft_t ## _input_flush__nocheck(a, s);                          \
163         soft_t ## _input_flush__nocheck(b, s);                          \
164     }
165 
166 GEN_INPUT_FLUSH2(float32_input_flush2, float32)
167 GEN_INPUT_FLUSH2(float64_input_flush2, float64)
168 #undef GEN_INPUT_FLUSH2
169 
170 #define GEN_INPUT_FLUSH3(name, soft_t)                                  \
171     static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \
172     {                                                                   \
173         if (likely(!s->flush_inputs_to_zero)) {                         \
174             return;                                                     \
175         }                                                               \
176         soft_t ## _input_flush__nocheck(a, s);                          \
177         soft_t ## _input_flush__nocheck(b, s);                          \
178         soft_t ## _input_flush__nocheck(c, s);                          \
179     }
180 
181 GEN_INPUT_FLUSH3(float32_input_flush3, float32)
182 GEN_INPUT_FLUSH3(float64_input_flush3, float64)
183 #undef GEN_INPUT_FLUSH3
184 
185 /*
186  * Choose whether to use fpclassify or float32/64_* primitives in the generated
187  * hardfloat functions. Each combination of number of inputs and float size
188  * gets its own value.
189  */
190 #if defined(__x86_64__)
191 # define QEMU_HARDFLOAT_1F32_USE_FP 0
192 # define QEMU_HARDFLOAT_1F64_USE_FP 1
193 # define QEMU_HARDFLOAT_2F32_USE_FP 0
194 # define QEMU_HARDFLOAT_2F64_USE_FP 1
195 # define QEMU_HARDFLOAT_3F32_USE_FP 0
196 # define QEMU_HARDFLOAT_3F64_USE_FP 1
197 #else
198 # define QEMU_HARDFLOAT_1F32_USE_FP 0
199 # define QEMU_HARDFLOAT_1F64_USE_FP 0
200 # define QEMU_HARDFLOAT_2F32_USE_FP 0
201 # define QEMU_HARDFLOAT_2F64_USE_FP 0
202 # define QEMU_HARDFLOAT_3F32_USE_FP 0
203 # define QEMU_HARDFLOAT_3F64_USE_FP 0
204 #endif
205 
206 /*
207  * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over
208  * float{32,64}_is_infinity when !USE_FP.
209  * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup.
210  * On power64 however, using isinf() reduces fp-bench performance by up to 50%.
211  */
212 #if defined(__x86_64__) || defined(__aarch64__)
213 # define QEMU_HARDFLOAT_USE_ISINF   1
214 #else
215 # define QEMU_HARDFLOAT_USE_ISINF   0
216 #endif
217 
218 /*
219  * Some targets clear the FP flags before most FP operations. This prevents
220  * the use of hardfloat, since hardfloat relies on the inexact flag being
221  * already set.
222  */
223 #if defined(TARGET_PPC) || defined(__FAST_MATH__)
224 # if defined(__FAST_MATH__)
225 #  warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \
226     IEEE implementation
227 # endif
228 # define QEMU_NO_HARDFLOAT 1
229 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN
230 #else
231 # define QEMU_NO_HARDFLOAT 0
232 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline))
233 #endif
234 
235 static inline bool can_use_fpu(const float_status *s)
236 {
237     if (QEMU_NO_HARDFLOAT) {
238         return false;
239     }
240     return likely(s->float_exception_flags & float_flag_inexact &&
241                   s->float_rounding_mode == float_round_nearest_even);
242 }
243 
244 /*
245  * Hardfloat generation functions. Each operation can have two flavors:
246  * either using softfloat primitives (e.g. float32_is_zero_or_normal) for
247  * most condition checks, or native ones (e.g. fpclassify).
248  *
249  * The flavor is chosen by the callers. Instead of using macros, we rely on the
250  * compiler to propagate constants and inline everything into the callers.
251  *
252  * We only generate functions for operations with two inputs, since only
253  * these are common enough to justify consolidating them into common code.
254  */
255 
256 typedef union {
257     float32 s;
258     float h;
259 } union_float32;
260 
261 typedef union {
262     float64 s;
263     double h;
264 } union_float64;
265 
266 typedef bool (*f32_check_fn)(union_float32 a, union_float32 b);
267 typedef bool (*f64_check_fn)(union_float64 a, union_float64 b);
268 
269 typedef float32 (*soft_f32_op2_fn)(float32 a, float32 b, float_status *s);
270 typedef float64 (*soft_f64_op2_fn)(float64 a, float64 b, float_status *s);
271 typedef float   (*hard_f32_op2_fn)(float a, float b);
272 typedef double  (*hard_f64_op2_fn)(double a, double b);
273 
274 /* 2-input is-zero-or-normal */
275 static inline bool f32_is_zon2(union_float32 a, union_float32 b)
276 {
277     if (QEMU_HARDFLOAT_2F32_USE_FP) {
278         /*
279          * Not using a temp variable for consecutive fpclassify calls ends up
280          * generating faster code.
281          */
282         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
283                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
284     }
285     return float32_is_zero_or_normal(a.s) &&
286            float32_is_zero_or_normal(b.s);
287 }
288 
289 static inline bool f64_is_zon2(union_float64 a, union_float64 b)
290 {
291     if (QEMU_HARDFLOAT_2F64_USE_FP) {
292         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
293                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
294     }
295     return float64_is_zero_or_normal(a.s) &&
296            float64_is_zero_or_normal(b.s);
297 }
298 
299 /* 3-input is-zero-or-normal */
300 static inline
301 bool f32_is_zon3(union_float32 a, union_float32 b, union_float32 c)
302 {
303     if (QEMU_HARDFLOAT_3F32_USE_FP) {
304         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
305                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
306                (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
307     }
308     return float32_is_zero_or_normal(a.s) &&
309            float32_is_zero_or_normal(b.s) &&
310            float32_is_zero_or_normal(c.s);
311 }
312 
313 static inline
314 bool f64_is_zon3(union_float64 a, union_float64 b, union_float64 c)
315 {
316     if (QEMU_HARDFLOAT_3F64_USE_FP) {
317         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
318                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
319                (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
320     }
321     return float64_is_zero_or_normal(a.s) &&
322            float64_is_zero_or_normal(b.s) &&
323            float64_is_zero_or_normal(c.s);
324 }
325 
326 static inline bool f32_is_inf(union_float32 a)
327 {
328     if (QEMU_HARDFLOAT_USE_ISINF) {
329         return isinf(a.h);
330     }
331     return float32_is_infinity(a.s);
332 }
333 
334 static inline bool f64_is_inf(union_float64 a)
335 {
336     if (QEMU_HARDFLOAT_USE_ISINF) {
337         return isinf(a.h);
338     }
339     return float64_is_infinity(a.s);
340 }
341 
342 static inline float32
343 float32_gen2(float32 xa, float32 xb, float_status *s,
344              hard_f32_op2_fn hard, soft_f32_op2_fn soft,
345              f32_check_fn pre, f32_check_fn post)
346 {
347     union_float32 ua, ub, ur;
348 
349     ua.s = xa;
350     ub.s = xb;
351 
352     if (unlikely(!can_use_fpu(s))) {
353         goto soft;
354     }
355 
356     float32_input_flush2(&ua.s, &ub.s, s);
357     if (unlikely(!pre(ua, ub))) {
358         goto soft;
359     }
360 
361     ur.h = hard(ua.h, ub.h);
362     if (unlikely(f32_is_inf(ur))) {
363         s->float_exception_flags |= float_flag_overflow;
364     } else if (unlikely(fabsf(ur.h) <= FLT_MIN) && post(ua, ub)) {
365         goto soft;
366     }
367     return ur.s;
368 
369  soft:
370     return soft(ua.s, ub.s, s);
371 }
372 
373 static inline float64
374 float64_gen2(float64 xa, float64 xb, float_status *s,
375              hard_f64_op2_fn hard, soft_f64_op2_fn soft,
376              f64_check_fn pre, f64_check_fn post)
377 {
378     union_float64 ua, ub, ur;
379 
380     ua.s = xa;
381     ub.s = xb;
382 
383     if (unlikely(!can_use_fpu(s))) {
384         goto soft;
385     }
386 
387     float64_input_flush2(&ua.s, &ub.s, s);
388     if (unlikely(!pre(ua, ub))) {
389         goto soft;
390     }
391 
392     ur.h = hard(ua.h, ub.h);
393     if (unlikely(f64_is_inf(ur))) {
394         s->float_exception_flags |= float_flag_overflow;
395     } else if (unlikely(fabs(ur.h) <= DBL_MIN) && post(ua, ub)) {
396         goto soft;
397     }
398     return ur.s;
399 
400  soft:
401     return soft(ua.s, ub.s, s);
402 }
403 
404 /*----------------------------------------------------------------------------
405 | Returns the fraction bits of the single-precision floating-point value `a'.
406 *----------------------------------------------------------------------------*/
407 
408 static inline uint32_t extractFloat32Frac(float32 a)
409 {
410     return float32_val(a) & 0x007FFFFF;
411 }
412 
413 /*----------------------------------------------------------------------------
414 | Returns the exponent bits of the single-precision floating-point value `a'.
415 *----------------------------------------------------------------------------*/
416 
417 static inline int extractFloat32Exp(float32 a)
418 {
419     return (float32_val(a) >> 23) & 0xFF;
420 }
421 
422 /*----------------------------------------------------------------------------
423 | Returns the sign bit of the single-precision floating-point value `a'.
424 *----------------------------------------------------------------------------*/
425 
426 static inline bool extractFloat32Sign(float32 a)
427 {
428     return float32_val(a) >> 31;
429 }
430 
431 /*----------------------------------------------------------------------------
432 | Returns the fraction bits of the double-precision floating-point value `a'.
433 *----------------------------------------------------------------------------*/
434 
435 static inline uint64_t extractFloat64Frac(float64 a)
436 {
437     return float64_val(a) & UINT64_C(0x000FFFFFFFFFFFFF);
438 }
439 
440 /*----------------------------------------------------------------------------
441 | Returns the exponent bits of the double-precision floating-point value `a'.
442 *----------------------------------------------------------------------------*/
443 
444 static inline int extractFloat64Exp(float64 a)
445 {
446     return (float64_val(a) >> 52) & 0x7FF;
447 }
448 
449 /*----------------------------------------------------------------------------
450 | Returns the sign bit of the double-precision floating-point value `a'.
451 *----------------------------------------------------------------------------*/
452 
453 static inline bool extractFloat64Sign(float64 a)
454 {
455     return float64_val(a) >> 63;
456 }
457 
458 /*
459  * Classify a floating point number. Everything above float_class_qnan
460  * is a NaN so cls >= float_class_qnan is any NaN.
461  */
462 
463 typedef enum __attribute__ ((__packed__)) {
464     float_class_unclassified,
465     float_class_zero,
466     float_class_normal,
467     float_class_inf,
468     float_class_qnan,  /* all NaNs from here */
469     float_class_snan,
470 } FloatClass;
471 
472 /* Simple helpers for checking if, or what kind of, NaN we have */
473 static inline __attribute__((unused)) bool is_nan(FloatClass c)
474 {
475     return unlikely(c >= float_class_qnan);
476 }
477 
478 static inline __attribute__((unused)) bool is_snan(FloatClass c)
479 {
480     return c == float_class_snan;
481 }
482 
483 static inline __attribute__((unused)) bool is_qnan(FloatClass c)
484 {
485     return c == float_class_qnan;
486 }
487 
488 /*
489  * Structure holding all of the decomposed parts of a float. The
490  * exponent is unbiased and the fraction is normalized. All
491  * calculations are done with a 64 bit fraction and then rounded as
492  * appropriate for the final format.
493  *
494  * Thanks to the packed FloatClass a decent compiler should be able to
495  * fit the whole structure into registers and avoid using the stack
496  * for parameter passing.
497  */
498 
499 typedef struct {
500     uint64_t frac;
501     int32_t  exp;
502     FloatClass cls;
503     bool sign;
504 } FloatParts;
505 
506 #define DECOMPOSED_BINARY_POINT    (64 - 2)
507 #define DECOMPOSED_IMPLICIT_BIT    (1ull << DECOMPOSED_BINARY_POINT)
508 #define DECOMPOSED_OVERFLOW_BIT    (DECOMPOSED_IMPLICIT_BIT << 1)
509 
510 /* Structure holding all of the relevant parameters for a format.
511  *   exp_size: the size of the exponent field
512  *   exp_bias: the offset applied to the exponent field
513  *   exp_max: the maximum normalised exponent
514  *   frac_size: the size of the fraction field
515  *   frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
516  * The following are computed based the size of fraction
517  *   frac_lsb: least significant bit of fraction
518  *   frac_lsbm1: the bit below the least significant bit (for rounding)
519  *   round_mask/roundeven_mask: masks used for rounding
520  * The following optional modifiers are available:
521  *   arm_althp: handle ARM Alternative Half Precision
522  */
523 typedef struct {
524     int exp_size;
525     int exp_bias;
526     int exp_max;
527     int frac_size;
528     int frac_shift;
529     uint64_t frac_lsb;
530     uint64_t frac_lsbm1;
531     uint64_t round_mask;
532     uint64_t roundeven_mask;
533     bool arm_althp;
534 } FloatFmt;
535 
536 /* Expand fields based on the size of exponent and fraction */
537 #define FLOAT_PARAMS(E, F)                                           \
538     .exp_size       = E,                                             \
539     .exp_bias       = ((1 << E) - 1) >> 1,                           \
540     .exp_max        = (1 << E) - 1,                                  \
541     .frac_size      = F,                                             \
542     .frac_shift     = DECOMPOSED_BINARY_POINT - F,                   \
543     .frac_lsb       = 1ull << (DECOMPOSED_BINARY_POINT - F),         \
544     .frac_lsbm1     = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1),   \
545     .round_mask     = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1,   \
546     .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1
547 
548 static const FloatFmt float16_params = {
549     FLOAT_PARAMS(5, 10)
550 };
551 
552 static const FloatFmt float16_params_ahp = {
553     FLOAT_PARAMS(5, 10),
554     .arm_althp = true
555 };
556 
557 static const FloatFmt bfloat16_params = {
558     FLOAT_PARAMS(8, 7)
559 };
560 
561 static const FloatFmt float32_params = {
562     FLOAT_PARAMS(8, 23)
563 };
564 
565 static const FloatFmt float64_params = {
566     FLOAT_PARAMS(11, 52)
567 };
568 
569 /* Unpack a float to parts, but do not canonicalize.  */
570 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw)
571 {
572     const int sign_pos = fmt.frac_size + fmt.exp_size;
573 
574     return (FloatParts) {
575         .cls = float_class_unclassified,
576         .sign = extract64(raw, sign_pos, 1),
577         .exp = extract64(raw, fmt.frac_size, fmt.exp_size),
578         .frac = extract64(raw, 0, fmt.frac_size),
579     };
580 }
581 
582 static inline FloatParts float16_unpack_raw(float16 f)
583 {
584     return unpack_raw(float16_params, f);
585 }
586 
587 static inline FloatParts bfloat16_unpack_raw(bfloat16 f)
588 {
589     return unpack_raw(bfloat16_params, f);
590 }
591 
592 static inline FloatParts float32_unpack_raw(float32 f)
593 {
594     return unpack_raw(float32_params, f);
595 }
596 
597 static inline FloatParts float64_unpack_raw(float64 f)
598 {
599     return unpack_raw(float64_params, f);
600 }
601 
602 /* Pack a float from parts, but do not canonicalize.  */
603 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p)
604 {
605     const int sign_pos = fmt.frac_size + fmt.exp_size;
606     uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp);
607     return deposit64(ret, sign_pos, 1, p.sign);
608 }
609 
610 static inline float16 float16_pack_raw(FloatParts p)
611 {
612     return make_float16(pack_raw(float16_params, p));
613 }
614 
615 static inline bfloat16 bfloat16_pack_raw(FloatParts p)
616 {
617     return pack_raw(bfloat16_params, p);
618 }
619 
620 static inline float32 float32_pack_raw(FloatParts p)
621 {
622     return make_float32(pack_raw(float32_params, p));
623 }
624 
625 static inline float64 float64_pack_raw(FloatParts p)
626 {
627     return make_float64(pack_raw(float64_params, p));
628 }
629 
630 /*----------------------------------------------------------------------------
631 | Functions and definitions to determine:  (1) whether tininess for underflow
632 | is detected before or after rounding by default, (2) what (if anything)
633 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
634 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
635 | are propagated from function inputs to output.  These details are target-
636 | specific.
637 *----------------------------------------------------------------------------*/
638 #include "softfloat-specialize.c.inc"
639 
640 /* Canonicalize EXP and FRAC, setting CLS.  */
641 static FloatParts sf_canonicalize(FloatParts part, const FloatFmt *parm,
642                                   float_status *status)
643 {
644     if (part.exp == parm->exp_max && !parm->arm_althp) {
645         if (part.frac == 0) {
646             part.cls = float_class_inf;
647         } else {
648             part.frac <<= parm->frac_shift;
649             part.cls = (parts_is_snan_frac(part.frac, status)
650                         ? float_class_snan : float_class_qnan);
651         }
652     } else if (part.exp == 0) {
653         if (likely(part.frac == 0)) {
654             part.cls = float_class_zero;
655         } else if (status->flush_inputs_to_zero) {
656             float_raise(float_flag_input_denormal, status);
657             part.cls = float_class_zero;
658             part.frac = 0;
659         } else {
660             int shift = clz64(part.frac) - 1;
661             part.cls = float_class_normal;
662             part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
663             part.frac <<= shift;
664         }
665     } else {
666         part.cls = float_class_normal;
667         part.exp -= parm->exp_bias;
668         part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
669     }
670     return part;
671 }
672 
673 /* Round and uncanonicalize a floating-point number by parts. There
674  * are FRAC_SHIFT bits that may require rounding at the bottom of the
675  * fraction; these bits will be removed. The exponent will be biased
676  * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
677  */
678 
679 static FloatParts round_canonical(FloatParts p, float_status *s,
680                                   const FloatFmt *parm)
681 {
682     const uint64_t frac_lsb = parm->frac_lsb;
683     const uint64_t frac_lsbm1 = parm->frac_lsbm1;
684     const uint64_t round_mask = parm->round_mask;
685     const uint64_t roundeven_mask = parm->roundeven_mask;
686     const int exp_max = parm->exp_max;
687     const int frac_shift = parm->frac_shift;
688     uint64_t frac, inc;
689     int exp, flags = 0;
690     bool overflow_norm;
691 
692     frac = p.frac;
693     exp = p.exp;
694 
695     switch (p.cls) {
696     case float_class_normal:
697         switch (s->float_rounding_mode) {
698         case float_round_nearest_even:
699             overflow_norm = false;
700             inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
701             break;
702         case float_round_ties_away:
703             overflow_norm = false;
704             inc = frac_lsbm1;
705             break;
706         case float_round_to_zero:
707             overflow_norm = true;
708             inc = 0;
709             break;
710         case float_round_up:
711             inc = p.sign ? 0 : round_mask;
712             overflow_norm = p.sign;
713             break;
714         case float_round_down:
715             inc = p.sign ? round_mask : 0;
716             overflow_norm = !p.sign;
717             break;
718         case float_round_to_odd:
719             overflow_norm = true;
720             inc = frac & frac_lsb ? 0 : round_mask;
721             break;
722         default:
723             g_assert_not_reached();
724         }
725 
726         exp += parm->exp_bias;
727         if (likely(exp > 0)) {
728             if (frac & round_mask) {
729                 flags |= float_flag_inexact;
730                 frac += inc;
731                 if (frac & DECOMPOSED_OVERFLOW_BIT) {
732                     frac >>= 1;
733                     exp++;
734                 }
735             }
736             frac >>= frac_shift;
737 
738             if (parm->arm_althp) {
739                 /* ARM Alt HP eschews Inf and NaN for a wider exponent.  */
740                 if (unlikely(exp > exp_max)) {
741                     /* Overflow.  Return the maximum normal.  */
742                     flags = float_flag_invalid;
743                     exp = exp_max;
744                     frac = -1;
745                 }
746             } else if (unlikely(exp >= exp_max)) {
747                 flags |= float_flag_overflow | float_flag_inexact;
748                 if (overflow_norm) {
749                     exp = exp_max - 1;
750                     frac = -1;
751                 } else {
752                     p.cls = float_class_inf;
753                     goto do_inf;
754                 }
755             }
756         } else if (s->flush_to_zero) {
757             flags |= float_flag_output_denormal;
758             p.cls = float_class_zero;
759             goto do_zero;
760         } else {
761             bool is_tiny = s->tininess_before_rounding
762                         || (exp < 0)
763                         || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT);
764 
765             shift64RightJamming(frac, 1 - exp, &frac);
766             if (frac & round_mask) {
767                 /* Need to recompute round-to-even.  */
768                 switch (s->float_rounding_mode) {
769                 case float_round_nearest_even:
770                     inc = ((frac & roundeven_mask) != frac_lsbm1
771                            ? frac_lsbm1 : 0);
772                     break;
773                 case float_round_to_odd:
774                     inc = frac & frac_lsb ? 0 : round_mask;
775                     break;
776                 default:
777                     break;
778                 }
779                 flags |= float_flag_inexact;
780                 frac += inc;
781             }
782 
783             exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
784             frac >>= frac_shift;
785 
786             if (is_tiny && (flags & float_flag_inexact)) {
787                 flags |= float_flag_underflow;
788             }
789             if (exp == 0 && frac == 0) {
790                 p.cls = float_class_zero;
791             }
792         }
793         break;
794 
795     case float_class_zero:
796     do_zero:
797         exp = 0;
798         frac = 0;
799         break;
800 
801     case float_class_inf:
802     do_inf:
803         assert(!parm->arm_althp);
804         exp = exp_max;
805         frac = 0;
806         break;
807 
808     case float_class_qnan:
809     case float_class_snan:
810         assert(!parm->arm_althp);
811         exp = exp_max;
812         frac >>= parm->frac_shift;
813         break;
814 
815     default:
816         g_assert_not_reached();
817     }
818 
819     float_raise(flags, s);
820     p.exp = exp;
821     p.frac = frac;
822     return p;
823 }
824 
825 /* Explicit FloatFmt version */
826 static FloatParts float16a_unpack_canonical(float16 f, float_status *s,
827                                             const FloatFmt *params)
828 {
829     return sf_canonicalize(float16_unpack_raw(f), params, s);
830 }
831 
832 static FloatParts float16_unpack_canonical(float16 f, float_status *s)
833 {
834     return float16a_unpack_canonical(f, s, &float16_params);
835 }
836 
837 static FloatParts bfloat16_unpack_canonical(bfloat16 f, float_status *s)
838 {
839     return sf_canonicalize(bfloat16_unpack_raw(f), &bfloat16_params, s);
840 }
841 
842 static float16 float16a_round_pack_canonical(FloatParts p, float_status *s,
843                                              const FloatFmt *params)
844 {
845     return float16_pack_raw(round_canonical(p, s, params));
846 }
847 
848 static float16 float16_round_pack_canonical(FloatParts p, float_status *s)
849 {
850     return float16a_round_pack_canonical(p, s, &float16_params);
851 }
852 
853 static bfloat16 bfloat16_round_pack_canonical(FloatParts p, float_status *s)
854 {
855     return bfloat16_pack_raw(round_canonical(p, s, &bfloat16_params));
856 }
857 
858 static FloatParts float32_unpack_canonical(float32 f, float_status *s)
859 {
860     return sf_canonicalize(float32_unpack_raw(f), &float32_params, s);
861 }
862 
863 static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
864 {
865     return float32_pack_raw(round_canonical(p, s, &float32_params));
866 }
867 
868 static FloatParts float64_unpack_canonical(float64 f, float_status *s)
869 {
870     return sf_canonicalize(float64_unpack_raw(f), &float64_params, s);
871 }
872 
873 static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
874 {
875     return float64_pack_raw(round_canonical(p, s, &float64_params));
876 }
877 
878 static FloatParts return_nan(FloatParts a, float_status *s)
879 {
880     switch (a.cls) {
881     case float_class_snan:
882         s->float_exception_flags |= float_flag_invalid;
883         a = parts_silence_nan(a, s);
884         /* fall through */
885     case float_class_qnan:
886         if (s->default_nan_mode) {
887             return parts_default_nan(s);
888         }
889         break;
890 
891     default:
892         g_assert_not_reached();
893     }
894     return a;
895 }
896 
897 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s)
898 {
899     if (is_snan(a.cls) || is_snan(b.cls)) {
900         s->float_exception_flags |= float_flag_invalid;
901     }
902 
903     if (s->default_nan_mode) {
904         return parts_default_nan(s);
905     } else {
906         if (pickNaN(a.cls, b.cls,
907                     a.frac > b.frac ||
908                     (a.frac == b.frac && a.sign < b.sign), s)) {
909             a = b;
910         }
911         if (is_snan(a.cls)) {
912             return parts_silence_nan(a, s);
913         }
914     }
915     return a;
916 }
917 
918 static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c,
919                                   bool inf_zero, float_status *s)
920 {
921     int which;
922 
923     if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
924         s->float_exception_flags |= float_flag_invalid;
925     }
926 
927     which = pickNaNMulAdd(a.cls, b.cls, c.cls, inf_zero, s);
928 
929     if (s->default_nan_mode) {
930         /* Note that this check is after pickNaNMulAdd so that function
931          * has an opportunity to set the Invalid flag.
932          */
933         which = 3;
934     }
935 
936     switch (which) {
937     case 0:
938         break;
939     case 1:
940         a = b;
941         break;
942     case 2:
943         a = c;
944         break;
945     case 3:
946         return parts_default_nan(s);
947     default:
948         g_assert_not_reached();
949     }
950 
951     if (is_snan(a.cls)) {
952         return parts_silence_nan(a, s);
953     }
954     return a;
955 }
956 
957 /*
958  * Returns the result of adding or subtracting the values of the
959  * floating-point values `a' and `b'. The operation is performed
960  * according to the IEC/IEEE Standard for Binary Floating-Point
961  * Arithmetic.
962  */
963 
964 static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
965                                 float_status *s)
966 {
967     bool a_sign = a.sign;
968     bool b_sign = b.sign ^ subtract;
969 
970     if (a_sign != b_sign) {
971         /* Subtraction */
972 
973         if (a.cls == float_class_normal && b.cls == float_class_normal) {
974             if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
975                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
976                 a.frac = a.frac - b.frac;
977             } else {
978                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
979                 a.frac = b.frac - a.frac;
980                 a.exp = b.exp;
981                 a_sign ^= 1;
982             }
983 
984             if (a.frac == 0) {
985                 a.cls = float_class_zero;
986                 a.sign = s->float_rounding_mode == float_round_down;
987             } else {
988                 int shift = clz64(a.frac) - 1;
989                 a.frac = a.frac << shift;
990                 a.exp = a.exp - shift;
991                 a.sign = a_sign;
992             }
993             return a;
994         }
995         if (is_nan(a.cls) || is_nan(b.cls)) {
996             return pick_nan(a, b, s);
997         }
998         if (a.cls == float_class_inf) {
999             if (b.cls == float_class_inf) {
1000                 float_raise(float_flag_invalid, s);
1001                 return parts_default_nan(s);
1002             }
1003             return a;
1004         }
1005         if (a.cls == float_class_zero && b.cls == float_class_zero) {
1006             a.sign = s->float_rounding_mode == float_round_down;
1007             return a;
1008         }
1009         if (a.cls == float_class_zero || b.cls == float_class_inf) {
1010             b.sign = a_sign ^ 1;
1011             return b;
1012         }
1013         if (b.cls == float_class_zero) {
1014             return a;
1015         }
1016     } else {
1017         /* Addition */
1018         if (a.cls == float_class_normal && b.cls == float_class_normal) {
1019             if (a.exp > b.exp) {
1020                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
1021             } else if (a.exp < b.exp) {
1022                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
1023                 a.exp = b.exp;
1024             }
1025             a.frac += b.frac;
1026             if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
1027                 shift64RightJamming(a.frac, 1, &a.frac);
1028                 a.exp += 1;
1029             }
1030             return a;
1031         }
1032         if (is_nan(a.cls) || is_nan(b.cls)) {
1033             return pick_nan(a, b, s);
1034         }
1035         if (a.cls == float_class_inf || b.cls == float_class_zero) {
1036             return a;
1037         }
1038         if (b.cls == float_class_inf || a.cls == float_class_zero) {
1039             b.sign = b_sign;
1040             return b;
1041         }
1042     }
1043     g_assert_not_reached();
1044 }
1045 
1046 /*
1047  * Returns the result of adding or subtracting the floating-point
1048  * values `a' and `b'. The operation is performed according to the
1049  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1050  */
1051 
1052 float16 QEMU_FLATTEN float16_add(float16 a, float16 b, float_status *status)
1053 {
1054     FloatParts pa = float16_unpack_canonical(a, status);
1055     FloatParts pb = float16_unpack_canonical(b, status);
1056     FloatParts pr = addsub_floats(pa, pb, false, status);
1057 
1058     return float16_round_pack_canonical(pr, status);
1059 }
1060 
1061 float16 QEMU_FLATTEN float16_sub(float16 a, float16 b, float_status *status)
1062 {
1063     FloatParts pa = float16_unpack_canonical(a, status);
1064     FloatParts pb = float16_unpack_canonical(b, status);
1065     FloatParts pr = addsub_floats(pa, pb, true, status);
1066 
1067     return float16_round_pack_canonical(pr, status);
1068 }
1069 
1070 static float32 QEMU_SOFTFLOAT_ATTR
1071 soft_f32_addsub(float32 a, float32 b, bool subtract, float_status *status)
1072 {
1073     FloatParts pa = float32_unpack_canonical(a, status);
1074     FloatParts pb = float32_unpack_canonical(b, status);
1075     FloatParts pr = addsub_floats(pa, pb, subtract, status);
1076 
1077     return float32_round_pack_canonical(pr, status);
1078 }
1079 
1080 static inline float32 soft_f32_add(float32 a, float32 b, float_status *status)
1081 {
1082     return soft_f32_addsub(a, b, false, status);
1083 }
1084 
1085 static inline float32 soft_f32_sub(float32 a, float32 b, float_status *status)
1086 {
1087     return soft_f32_addsub(a, b, true, status);
1088 }
1089 
1090 static float64 QEMU_SOFTFLOAT_ATTR
1091 soft_f64_addsub(float64 a, float64 b, bool subtract, float_status *status)
1092 {
1093     FloatParts pa = float64_unpack_canonical(a, status);
1094     FloatParts pb = float64_unpack_canonical(b, status);
1095     FloatParts pr = addsub_floats(pa, pb, subtract, status);
1096 
1097     return float64_round_pack_canonical(pr, status);
1098 }
1099 
1100 static inline float64 soft_f64_add(float64 a, float64 b, float_status *status)
1101 {
1102     return soft_f64_addsub(a, b, false, status);
1103 }
1104 
1105 static inline float64 soft_f64_sub(float64 a, float64 b, float_status *status)
1106 {
1107     return soft_f64_addsub(a, b, true, status);
1108 }
1109 
1110 static float hard_f32_add(float a, float b)
1111 {
1112     return a + b;
1113 }
1114 
1115 static float hard_f32_sub(float a, float b)
1116 {
1117     return a - b;
1118 }
1119 
1120 static double hard_f64_add(double a, double b)
1121 {
1122     return a + b;
1123 }
1124 
1125 static double hard_f64_sub(double a, double b)
1126 {
1127     return a - b;
1128 }
1129 
1130 static bool f32_addsubmul_post(union_float32 a, union_float32 b)
1131 {
1132     if (QEMU_HARDFLOAT_2F32_USE_FP) {
1133         return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
1134     }
1135     return !(float32_is_zero(a.s) && float32_is_zero(b.s));
1136 }
1137 
1138 static bool f64_addsubmul_post(union_float64 a, union_float64 b)
1139 {
1140     if (QEMU_HARDFLOAT_2F64_USE_FP) {
1141         return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
1142     } else {
1143         return !(float64_is_zero(a.s) && float64_is_zero(b.s));
1144     }
1145 }
1146 
1147 static float32 float32_addsub(float32 a, float32 b, float_status *s,
1148                               hard_f32_op2_fn hard, soft_f32_op2_fn soft)
1149 {
1150     return float32_gen2(a, b, s, hard, soft,
1151                         f32_is_zon2, f32_addsubmul_post);
1152 }
1153 
1154 static float64 float64_addsub(float64 a, float64 b, float_status *s,
1155                               hard_f64_op2_fn hard, soft_f64_op2_fn soft)
1156 {
1157     return float64_gen2(a, b, s, hard, soft,
1158                         f64_is_zon2, f64_addsubmul_post);
1159 }
1160 
1161 float32 QEMU_FLATTEN
1162 float32_add(float32 a, float32 b, float_status *s)
1163 {
1164     return float32_addsub(a, b, s, hard_f32_add, soft_f32_add);
1165 }
1166 
1167 float32 QEMU_FLATTEN
1168 float32_sub(float32 a, float32 b, float_status *s)
1169 {
1170     return float32_addsub(a, b, s, hard_f32_sub, soft_f32_sub);
1171 }
1172 
1173 float64 QEMU_FLATTEN
1174 float64_add(float64 a, float64 b, float_status *s)
1175 {
1176     return float64_addsub(a, b, s, hard_f64_add, soft_f64_add);
1177 }
1178 
1179 float64 QEMU_FLATTEN
1180 float64_sub(float64 a, float64 b, float_status *s)
1181 {
1182     return float64_addsub(a, b, s, hard_f64_sub, soft_f64_sub);
1183 }
1184 
1185 /*
1186  * Returns the result of adding or subtracting the bfloat16
1187  * values `a' and `b'.
1188  */
1189 bfloat16 QEMU_FLATTEN bfloat16_add(bfloat16 a, bfloat16 b, float_status *status)
1190 {
1191     FloatParts pa = bfloat16_unpack_canonical(a, status);
1192     FloatParts pb = bfloat16_unpack_canonical(b, status);
1193     FloatParts pr = addsub_floats(pa, pb, false, status);
1194 
1195     return bfloat16_round_pack_canonical(pr, status);
1196 }
1197 
1198 bfloat16 QEMU_FLATTEN bfloat16_sub(bfloat16 a, bfloat16 b, float_status *status)
1199 {
1200     FloatParts pa = bfloat16_unpack_canonical(a, status);
1201     FloatParts pb = bfloat16_unpack_canonical(b, status);
1202     FloatParts pr = addsub_floats(pa, pb, true, status);
1203 
1204     return bfloat16_round_pack_canonical(pr, status);
1205 }
1206 
1207 /*
1208  * Returns the result of multiplying the floating-point values `a' and
1209  * `b'. The operation is performed according to the IEC/IEEE Standard
1210  * for Binary Floating-Point Arithmetic.
1211  */
1212 
1213 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s)
1214 {
1215     bool sign = a.sign ^ b.sign;
1216 
1217     if (a.cls == float_class_normal && b.cls == float_class_normal) {
1218         uint64_t hi, lo;
1219         int exp = a.exp + b.exp;
1220 
1221         mul64To128(a.frac, b.frac, &hi, &lo);
1222         shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1223         if (lo & DECOMPOSED_OVERFLOW_BIT) {
1224             shift64RightJamming(lo, 1, &lo);
1225             exp += 1;
1226         }
1227 
1228         /* Re-use a */
1229         a.exp = exp;
1230         a.sign = sign;
1231         a.frac = lo;
1232         return a;
1233     }
1234     /* handle all the NaN cases */
1235     if (is_nan(a.cls) || is_nan(b.cls)) {
1236         return pick_nan(a, b, s);
1237     }
1238     /* Inf * Zero == NaN */
1239     if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
1240         (a.cls == float_class_zero && b.cls == float_class_inf)) {
1241         s->float_exception_flags |= float_flag_invalid;
1242         return parts_default_nan(s);
1243     }
1244     /* Multiply by 0 or Inf */
1245     if (a.cls == float_class_inf || a.cls == float_class_zero) {
1246         a.sign = sign;
1247         return a;
1248     }
1249     if (b.cls == float_class_inf || b.cls == float_class_zero) {
1250         b.sign = sign;
1251         return b;
1252     }
1253     g_assert_not_reached();
1254 }
1255 
1256 float16 QEMU_FLATTEN float16_mul(float16 a, float16 b, float_status *status)
1257 {
1258     FloatParts pa = float16_unpack_canonical(a, status);
1259     FloatParts pb = float16_unpack_canonical(b, status);
1260     FloatParts pr = mul_floats(pa, pb, status);
1261 
1262     return float16_round_pack_canonical(pr, status);
1263 }
1264 
1265 static float32 QEMU_SOFTFLOAT_ATTR
1266 soft_f32_mul(float32 a, float32 b, float_status *status)
1267 {
1268     FloatParts pa = float32_unpack_canonical(a, status);
1269     FloatParts pb = float32_unpack_canonical(b, status);
1270     FloatParts pr = mul_floats(pa, pb, status);
1271 
1272     return float32_round_pack_canonical(pr, status);
1273 }
1274 
1275 static float64 QEMU_SOFTFLOAT_ATTR
1276 soft_f64_mul(float64 a, float64 b, float_status *status)
1277 {
1278     FloatParts pa = float64_unpack_canonical(a, status);
1279     FloatParts pb = float64_unpack_canonical(b, status);
1280     FloatParts pr = mul_floats(pa, pb, status);
1281 
1282     return float64_round_pack_canonical(pr, status);
1283 }
1284 
1285 static float hard_f32_mul(float a, float b)
1286 {
1287     return a * b;
1288 }
1289 
1290 static double hard_f64_mul(double a, double b)
1291 {
1292     return a * b;
1293 }
1294 
1295 float32 QEMU_FLATTEN
1296 float32_mul(float32 a, float32 b, float_status *s)
1297 {
1298     return float32_gen2(a, b, s, hard_f32_mul, soft_f32_mul,
1299                         f32_is_zon2, f32_addsubmul_post);
1300 }
1301 
1302 float64 QEMU_FLATTEN
1303 float64_mul(float64 a, float64 b, float_status *s)
1304 {
1305     return float64_gen2(a, b, s, hard_f64_mul, soft_f64_mul,
1306                         f64_is_zon2, f64_addsubmul_post);
1307 }
1308 
1309 /*
1310  * Returns the result of multiplying the bfloat16
1311  * values `a' and `b'.
1312  */
1313 
1314 bfloat16 QEMU_FLATTEN bfloat16_mul(bfloat16 a, bfloat16 b, float_status *status)
1315 {
1316     FloatParts pa = bfloat16_unpack_canonical(a, status);
1317     FloatParts pb = bfloat16_unpack_canonical(b, status);
1318     FloatParts pr = mul_floats(pa, pb, status);
1319 
1320     return bfloat16_round_pack_canonical(pr, status);
1321 }
1322 
1323 /*
1324  * Returns the result of multiplying the floating-point values `a' and
1325  * `b' then adding 'c', with no intermediate rounding step after the
1326  * multiplication. The operation is performed according to the
1327  * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
1328  * The flags argument allows the caller to select negation of the
1329  * addend, the intermediate product, or the final result. (The
1330  * difference between this and having the caller do a separate
1331  * negation is that negating externally will flip the sign bit on
1332  * NaNs.)
1333  */
1334 
1335 static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c,
1336                                 int flags, float_status *s)
1337 {
1338     bool inf_zero = ((1 << a.cls) | (1 << b.cls)) ==
1339                     ((1 << float_class_inf) | (1 << float_class_zero));
1340     bool p_sign;
1341     bool sign_flip = flags & float_muladd_negate_result;
1342     FloatClass p_class;
1343     uint64_t hi, lo;
1344     int p_exp;
1345 
1346     /* It is implementation-defined whether the cases of (0,inf,qnan)
1347      * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
1348      * they return if they do), so we have to hand this information
1349      * off to the target-specific pick-a-NaN routine.
1350      */
1351     if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) {
1352         return pick_nan_muladd(a, b, c, inf_zero, s);
1353     }
1354 
1355     if (inf_zero) {
1356         s->float_exception_flags |= float_flag_invalid;
1357         return parts_default_nan(s);
1358     }
1359 
1360     if (flags & float_muladd_negate_c) {
1361         c.sign ^= 1;
1362     }
1363 
1364     p_sign = a.sign ^ b.sign;
1365 
1366     if (flags & float_muladd_negate_product) {
1367         p_sign ^= 1;
1368     }
1369 
1370     if (a.cls == float_class_inf || b.cls == float_class_inf) {
1371         p_class = float_class_inf;
1372     } else if (a.cls == float_class_zero || b.cls == float_class_zero) {
1373         p_class = float_class_zero;
1374     } else {
1375         p_class = float_class_normal;
1376     }
1377 
1378     if (c.cls == float_class_inf) {
1379         if (p_class == float_class_inf && p_sign != c.sign) {
1380             s->float_exception_flags |= float_flag_invalid;
1381             return parts_default_nan(s);
1382         } else {
1383             a.cls = float_class_inf;
1384             a.sign = c.sign ^ sign_flip;
1385             return a;
1386         }
1387     }
1388 
1389     if (p_class == float_class_inf) {
1390         a.cls = float_class_inf;
1391         a.sign = p_sign ^ sign_flip;
1392         return a;
1393     }
1394 
1395     if (p_class == float_class_zero) {
1396         if (c.cls == float_class_zero) {
1397             if (p_sign != c.sign) {
1398                 p_sign = s->float_rounding_mode == float_round_down;
1399             }
1400             c.sign = p_sign;
1401         } else if (flags & float_muladd_halve_result) {
1402             c.exp -= 1;
1403         }
1404         c.sign ^= sign_flip;
1405         return c;
1406     }
1407 
1408     /* a & b should be normals now... */
1409     assert(a.cls == float_class_normal &&
1410            b.cls == float_class_normal);
1411 
1412     p_exp = a.exp + b.exp;
1413 
1414     /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
1415      * result.
1416      */
1417     mul64To128(a.frac, b.frac, &hi, &lo);
1418     /* binary point now at bit 124 */
1419 
1420     /* check for overflow */
1421     if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) {
1422         shift128RightJamming(hi, lo, 1, &hi, &lo);
1423         p_exp += 1;
1424     }
1425 
1426     /* + add/sub */
1427     if (c.cls == float_class_zero) {
1428         /* move binary point back to 62 */
1429         shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1430     } else {
1431         int exp_diff = p_exp - c.exp;
1432         if (p_sign == c.sign) {
1433             /* Addition */
1434             if (exp_diff <= 0) {
1435                 shift128RightJamming(hi, lo,
1436                                      DECOMPOSED_BINARY_POINT - exp_diff,
1437                                      &hi, &lo);
1438                 lo += c.frac;
1439                 p_exp = c.exp;
1440             } else {
1441                 uint64_t c_hi, c_lo;
1442                 /* shift c to the same binary point as the product (124) */
1443                 c_hi = c.frac >> 2;
1444                 c_lo = 0;
1445                 shift128RightJamming(c_hi, c_lo,
1446                                      exp_diff,
1447                                      &c_hi, &c_lo);
1448                 add128(hi, lo, c_hi, c_lo, &hi, &lo);
1449                 /* move binary point back to 62 */
1450                 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1451             }
1452 
1453             if (lo & DECOMPOSED_OVERFLOW_BIT) {
1454                 shift64RightJamming(lo, 1, &lo);
1455                 p_exp += 1;
1456             }
1457 
1458         } else {
1459             /* Subtraction */
1460             uint64_t c_hi, c_lo;
1461             /* make C binary point match product at bit 124 */
1462             c_hi = c.frac >> 2;
1463             c_lo = 0;
1464 
1465             if (exp_diff <= 0) {
1466                 shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
1467                 if (exp_diff == 0
1468                     &&
1469                     (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
1470                     sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1471                 } else {
1472                     sub128(c_hi, c_lo, hi, lo, &hi, &lo);
1473                     p_sign ^= 1;
1474                     p_exp = c.exp;
1475                 }
1476             } else {
1477                 shift128RightJamming(c_hi, c_lo,
1478                                      exp_diff,
1479                                      &c_hi, &c_lo);
1480                 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1481             }
1482 
1483             if (hi == 0 && lo == 0) {
1484                 a.cls = float_class_zero;
1485                 a.sign = s->float_rounding_mode == float_round_down;
1486                 a.sign ^= sign_flip;
1487                 return a;
1488             } else {
1489                 int shift;
1490                 if (hi != 0) {
1491                     shift = clz64(hi);
1492                 } else {
1493                     shift = clz64(lo) + 64;
1494                 }
1495                 /* Normalizing to a binary point of 124 is the
1496                    correct adjust for the exponent.  However since we're
1497                    shifting, we might as well put the binary point back
1498                    at 62 where we really want it.  Therefore shift as
1499                    if we're leaving 1 bit at the top of the word, but
1500                    adjust the exponent as if we're leaving 3 bits.  */
1501                 shift -= 1;
1502                 if (shift >= 64) {
1503                     lo = lo << (shift - 64);
1504                 } else {
1505                     hi = (hi << shift) | (lo >> (64 - shift));
1506                     lo = hi | ((lo << shift) != 0);
1507                 }
1508                 p_exp -= shift - 2;
1509             }
1510         }
1511     }
1512 
1513     if (flags & float_muladd_halve_result) {
1514         p_exp -= 1;
1515     }
1516 
1517     /* finally prepare our result */
1518     a.cls = float_class_normal;
1519     a.sign = p_sign ^ sign_flip;
1520     a.exp = p_exp;
1521     a.frac = lo;
1522 
1523     return a;
1524 }
1525 
1526 float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c,
1527                                                 int flags, float_status *status)
1528 {
1529     FloatParts pa = float16_unpack_canonical(a, status);
1530     FloatParts pb = float16_unpack_canonical(b, status);
1531     FloatParts pc = float16_unpack_canonical(c, status);
1532     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1533 
1534     return float16_round_pack_canonical(pr, status);
1535 }
1536 
1537 static float32 QEMU_SOFTFLOAT_ATTR
1538 soft_f32_muladd(float32 a, float32 b, float32 c, int flags,
1539                 float_status *status)
1540 {
1541     FloatParts pa = float32_unpack_canonical(a, status);
1542     FloatParts pb = float32_unpack_canonical(b, status);
1543     FloatParts pc = float32_unpack_canonical(c, status);
1544     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1545 
1546     return float32_round_pack_canonical(pr, status);
1547 }
1548 
1549 static float64 QEMU_SOFTFLOAT_ATTR
1550 soft_f64_muladd(float64 a, float64 b, float64 c, int flags,
1551                 float_status *status)
1552 {
1553     FloatParts pa = float64_unpack_canonical(a, status);
1554     FloatParts pb = float64_unpack_canonical(b, status);
1555     FloatParts pc = float64_unpack_canonical(c, status);
1556     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1557 
1558     return float64_round_pack_canonical(pr, status);
1559 }
1560 
1561 static bool force_soft_fma;
1562 
1563 float32 QEMU_FLATTEN
1564 float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s)
1565 {
1566     union_float32 ua, ub, uc, ur;
1567 
1568     ua.s = xa;
1569     ub.s = xb;
1570     uc.s = xc;
1571 
1572     if (unlikely(!can_use_fpu(s))) {
1573         goto soft;
1574     }
1575     if (unlikely(flags & float_muladd_halve_result)) {
1576         goto soft;
1577     }
1578 
1579     float32_input_flush3(&ua.s, &ub.s, &uc.s, s);
1580     if (unlikely(!f32_is_zon3(ua, ub, uc))) {
1581         goto soft;
1582     }
1583 
1584     if (unlikely(force_soft_fma)) {
1585         goto soft;
1586     }
1587 
1588     /*
1589      * When (a || b) == 0, there's no need to check for under/over flow,
1590      * since we know the addend is (normal || 0) and the product is 0.
1591      */
1592     if (float32_is_zero(ua.s) || float32_is_zero(ub.s)) {
1593         union_float32 up;
1594         bool prod_sign;
1595 
1596         prod_sign = float32_is_neg(ua.s) ^ float32_is_neg(ub.s);
1597         prod_sign ^= !!(flags & float_muladd_negate_product);
1598         up.s = float32_set_sign(float32_zero, prod_sign);
1599 
1600         if (flags & float_muladd_negate_c) {
1601             uc.h = -uc.h;
1602         }
1603         ur.h = up.h + uc.h;
1604     } else {
1605         union_float32 ua_orig = ua;
1606         union_float32 uc_orig = uc;
1607 
1608         if (flags & float_muladd_negate_product) {
1609             ua.h = -ua.h;
1610         }
1611         if (flags & float_muladd_negate_c) {
1612             uc.h = -uc.h;
1613         }
1614 
1615         ur.h = fmaf(ua.h, ub.h, uc.h);
1616 
1617         if (unlikely(f32_is_inf(ur))) {
1618             s->float_exception_flags |= float_flag_overflow;
1619         } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) {
1620             ua = ua_orig;
1621             uc = uc_orig;
1622             goto soft;
1623         }
1624     }
1625     if (flags & float_muladd_negate_result) {
1626         return float32_chs(ur.s);
1627     }
1628     return ur.s;
1629 
1630  soft:
1631     return soft_f32_muladd(ua.s, ub.s, uc.s, flags, s);
1632 }
1633 
1634 float64 QEMU_FLATTEN
1635 float64_muladd(float64 xa, float64 xb, float64 xc, int flags, float_status *s)
1636 {
1637     union_float64 ua, ub, uc, ur;
1638 
1639     ua.s = xa;
1640     ub.s = xb;
1641     uc.s = xc;
1642 
1643     if (unlikely(!can_use_fpu(s))) {
1644         goto soft;
1645     }
1646     if (unlikely(flags & float_muladd_halve_result)) {
1647         goto soft;
1648     }
1649 
1650     float64_input_flush3(&ua.s, &ub.s, &uc.s, s);
1651     if (unlikely(!f64_is_zon3(ua, ub, uc))) {
1652         goto soft;
1653     }
1654 
1655     if (unlikely(force_soft_fma)) {
1656         goto soft;
1657     }
1658 
1659     /*
1660      * When (a || b) == 0, there's no need to check for under/over flow,
1661      * since we know the addend is (normal || 0) and the product is 0.
1662      */
1663     if (float64_is_zero(ua.s) || float64_is_zero(ub.s)) {
1664         union_float64 up;
1665         bool prod_sign;
1666 
1667         prod_sign = float64_is_neg(ua.s) ^ float64_is_neg(ub.s);
1668         prod_sign ^= !!(flags & float_muladd_negate_product);
1669         up.s = float64_set_sign(float64_zero, prod_sign);
1670 
1671         if (flags & float_muladd_negate_c) {
1672             uc.h = -uc.h;
1673         }
1674         ur.h = up.h + uc.h;
1675     } else {
1676         union_float64 ua_orig = ua;
1677         union_float64 uc_orig = uc;
1678 
1679         if (flags & float_muladd_negate_product) {
1680             ua.h = -ua.h;
1681         }
1682         if (flags & float_muladd_negate_c) {
1683             uc.h = -uc.h;
1684         }
1685 
1686         ur.h = fma(ua.h, ub.h, uc.h);
1687 
1688         if (unlikely(f64_is_inf(ur))) {
1689             s->float_exception_flags |= float_flag_overflow;
1690         } else if (unlikely(fabs(ur.h) <= FLT_MIN)) {
1691             ua = ua_orig;
1692             uc = uc_orig;
1693             goto soft;
1694         }
1695     }
1696     if (flags & float_muladd_negate_result) {
1697         return float64_chs(ur.s);
1698     }
1699     return ur.s;
1700 
1701  soft:
1702     return soft_f64_muladd(ua.s, ub.s, uc.s, flags, s);
1703 }
1704 
1705 /*
1706  * Returns the result of multiplying the bfloat16 values `a'
1707  * and `b' then adding 'c', with no intermediate rounding step after the
1708  * multiplication.
1709  */
1710 
1711 bfloat16 QEMU_FLATTEN bfloat16_muladd(bfloat16 a, bfloat16 b, bfloat16 c,
1712                                       int flags, float_status *status)
1713 {
1714     FloatParts pa = bfloat16_unpack_canonical(a, status);
1715     FloatParts pb = bfloat16_unpack_canonical(b, status);
1716     FloatParts pc = bfloat16_unpack_canonical(c, status);
1717     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1718 
1719     return bfloat16_round_pack_canonical(pr, status);
1720 }
1721 
1722 /*
1723  * Returns the result of dividing the floating-point value `a' by the
1724  * corresponding value `b'. The operation is performed according to
1725  * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1726  */
1727 
1728 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s)
1729 {
1730     bool sign = a.sign ^ b.sign;
1731 
1732     if (a.cls == float_class_normal && b.cls == float_class_normal) {
1733         uint64_t n0, n1, q, r;
1734         int exp = a.exp - b.exp;
1735 
1736         /*
1737          * We want a 2*N / N-bit division to produce exactly an N-bit
1738          * result, so that we do not lose any precision and so that we
1739          * do not have to renormalize afterward.  If A.frac < B.frac,
1740          * then division would produce an (N-1)-bit result; shift A left
1741          * by one to produce the an N-bit result, and decrement the
1742          * exponent to match.
1743          *
1744          * The udiv_qrnnd algorithm that we're using requires normalization,
1745          * i.e. the msb of the denominator must be set.  Since we know that
1746          * DECOMPOSED_BINARY_POINT is msb-1, the inputs must be shifted left
1747          * by one (more), and the remainder must be shifted right by one.
1748          */
1749         if (a.frac < b.frac) {
1750             exp -= 1;
1751             shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 2, &n1, &n0);
1752         } else {
1753             shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, &n1, &n0);
1754         }
1755         q = udiv_qrnnd(&r, n1, n0, b.frac << 1);
1756 
1757         /*
1758          * Set lsb if there is a remainder, to set inexact.
1759          * As mentioned above, to find the actual value of the remainder we
1760          * would need to shift right, but (1) we are only concerned about
1761          * non-zero-ness, and (2) the remainder will always be even because
1762          * both inputs to the division primitive are even.
1763          */
1764         a.frac = q | (r != 0);
1765         a.sign = sign;
1766         a.exp = exp;
1767         return a;
1768     }
1769     /* handle all the NaN cases */
1770     if (is_nan(a.cls) || is_nan(b.cls)) {
1771         return pick_nan(a, b, s);
1772     }
1773     /* 0/0 or Inf/Inf */
1774     if (a.cls == b.cls
1775         &&
1776         (a.cls == float_class_inf || a.cls == float_class_zero)) {
1777         s->float_exception_flags |= float_flag_invalid;
1778         return parts_default_nan(s);
1779     }
1780     /* Inf / x or 0 / x */
1781     if (a.cls == float_class_inf || a.cls == float_class_zero) {
1782         a.sign = sign;
1783         return a;
1784     }
1785     /* Div 0 => Inf */
1786     if (b.cls == float_class_zero) {
1787         s->float_exception_flags |= float_flag_divbyzero;
1788         a.cls = float_class_inf;
1789         a.sign = sign;
1790         return a;
1791     }
1792     /* Div by Inf */
1793     if (b.cls == float_class_inf) {
1794         a.cls = float_class_zero;
1795         a.sign = sign;
1796         return a;
1797     }
1798     g_assert_not_reached();
1799 }
1800 
1801 float16 float16_div(float16 a, float16 b, float_status *status)
1802 {
1803     FloatParts pa = float16_unpack_canonical(a, status);
1804     FloatParts pb = float16_unpack_canonical(b, status);
1805     FloatParts pr = div_floats(pa, pb, status);
1806 
1807     return float16_round_pack_canonical(pr, status);
1808 }
1809 
1810 static float32 QEMU_SOFTFLOAT_ATTR
1811 soft_f32_div(float32 a, float32 b, float_status *status)
1812 {
1813     FloatParts pa = float32_unpack_canonical(a, status);
1814     FloatParts pb = float32_unpack_canonical(b, status);
1815     FloatParts pr = div_floats(pa, pb, status);
1816 
1817     return float32_round_pack_canonical(pr, status);
1818 }
1819 
1820 static float64 QEMU_SOFTFLOAT_ATTR
1821 soft_f64_div(float64 a, float64 b, float_status *status)
1822 {
1823     FloatParts pa = float64_unpack_canonical(a, status);
1824     FloatParts pb = float64_unpack_canonical(b, status);
1825     FloatParts pr = div_floats(pa, pb, status);
1826 
1827     return float64_round_pack_canonical(pr, status);
1828 }
1829 
1830 static float hard_f32_div(float a, float b)
1831 {
1832     return a / b;
1833 }
1834 
1835 static double hard_f64_div(double a, double b)
1836 {
1837     return a / b;
1838 }
1839 
1840 static bool f32_div_pre(union_float32 a, union_float32 b)
1841 {
1842     if (QEMU_HARDFLOAT_2F32_USE_FP) {
1843         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
1844                fpclassify(b.h) == FP_NORMAL;
1845     }
1846     return float32_is_zero_or_normal(a.s) && float32_is_normal(b.s);
1847 }
1848 
1849 static bool f64_div_pre(union_float64 a, union_float64 b)
1850 {
1851     if (QEMU_HARDFLOAT_2F64_USE_FP) {
1852         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
1853                fpclassify(b.h) == FP_NORMAL;
1854     }
1855     return float64_is_zero_or_normal(a.s) && float64_is_normal(b.s);
1856 }
1857 
1858 static bool f32_div_post(union_float32 a, union_float32 b)
1859 {
1860     if (QEMU_HARDFLOAT_2F32_USE_FP) {
1861         return fpclassify(a.h) != FP_ZERO;
1862     }
1863     return !float32_is_zero(a.s);
1864 }
1865 
1866 static bool f64_div_post(union_float64 a, union_float64 b)
1867 {
1868     if (QEMU_HARDFLOAT_2F64_USE_FP) {
1869         return fpclassify(a.h) != FP_ZERO;
1870     }
1871     return !float64_is_zero(a.s);
1872 }
1873 
1874 float32 QEMU_FLATTEN
1875 float32_div(float32 a, float32 b, float_status *s)
1876 {
1877     return float32_gen2(a, b, s, hard_f32_div, soft_f32_div,
1878                         f32_div_pre, f32_div_post);
1879 }
1880 
1881 float64 QEMU_FLATTEN
1882 float64_div(float64 a, float64 b, float_status *s)
1883 {
1884     return float64_gen2(a, b, s, hard_f64_div, soft_f64_div,
1885                         f64_div_pre, f64_div_post);
1886 }
1887 
1888 /*
1889  * Returns the result of dividing the bfloat16
1890  * value `a' by the corresponding value `b'.
1891  */
1892 
1893 bfloat16 bfloat16_div(bfloat16 a, bfloat16 b, float_status *status)
1894 {
1895     FloatParts pa = bfloat16_unpack_canonical(a, status);
1896     FloatParts pb = bfloat16_unpack_canonical(b, status);
1897     FloatParts pr = div_floats(pa, pb, status);
1898 
1899     return bfloat16_round_pack_canonical(pr, status);
1900 }
1901 
1902 /*
1903  * Float to Float conversions
1904  *
1905  * Returns the result of converting one float format to another. The
1906  * conversion is performed according to the IEC/IEEE Standard for
1907  * Binary Floating-Point Arithmetic.
1908  *
1909  * The float_to_float helper only needs to take care of raising
1910  * invalid exceptions and handling the conversion on NaNs.
1911  */
1912 
1913 static FloatParts float_to_float(FloatParts a, const FloatFmt *dstf,
1914                                  float_status *s)
1915 {
1916     if (dstf->arm_althp) {
1917         switch (a.cls) {
1918         case float_class_qnan:
1919         case float_class_snan:
1920             /* There is no NaN in the destination format.  Raise Invalid
1921              * and return a zero with the sign of the input NaN.
1922              */
1923             s->float_exception_flags |= float_flag_invalid;
1924             a.cls = float_class_zero;
1925             a.frac = 0;
1926             a.exp = 0;
1927             break;
1928 
1929         case float_class_inf:
1930             /* There is no Inf in the destination format.  Raise Invalid
1931              * and return the maximum normal with the correct sign.
1932              */
1933             s->float_exception_flags |= float_flag_invalid;
1934             a.cls = float_class_normal;
1935             a.exp = dstf->exp_max;
1936             a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift;
1937             break;
1938 
1939         default:
1940             break;
1941         }
1942     } else if (is_nan(a.cls)) {
1943         if (is_snan(a.cls)) {
1944             s->float_exception_flags |= float_flag_invalid;
1945             a = parts_silence_nan(a, s);
1946         }
1947         if (s->default_nan_mode) {
1948             return parts_default_nan(s);
1949         }
1950     }
1951     return a;
1952 }
1953 
1954 float32 float16_to_float32(float16 a, bool ieee, float_status *s)
1955 {
1956     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1957     FloatParts p = float16a_unpack_canonical(a, s, fmt16);
1958     FloatParts pr = float_to_float(p, &float32_params, s);
1959     return float32_round_pack_canonical(pr, s);
1960 }
1961 
1962 float64 float16_to_float64(float16 a, bool ieee, float_status *s)
1963 {
1964     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1965     FloatParts p = float16a_unpack_canonical(a, s, fmt16);
1966     FloatParts pr = float_to_float(p, &float64_params, s);
1967     return float64_round_pack_canonical(pr, s);
1968 }
1969 
1970 float16 float32_to_float16(float32 a, bool ieee, float_status *s)
1971 {
1972     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1973     FloatParts p = float32_unpack_canonical(a, s);
1974     FloatParts pr = float_to_float(p, fmt16, s);
1975     return float16a_round_pack_canonical(pr, s, fmt16);
1976 }
1977 
1978 static float64 QEMU_SOFTFLOAT_ATTR
1979 soft_float32_to_float64(float32 a, float_status *s)
1980 {
1981     FloatParts p = float32_unpack_canonical(a, s);
1982     FloatParts pr = float_to_float(p, &float64_params, s);
1983     return float64_round_pack_canonical(pr, s);
1984 }
1985 
1986 float64 float32_to_float64(float32 a, float_status *s)
1987 {
1988     if (likely(float32_is_normal(a))) {
1989         /* Widening conversion can never produce inexact results.  */
1990         union_float32 uf;
1991         union_float64 ud;
1992         uf.s = a;
1993         ud.h = uf.h;
1994         return ud.s;
1995     } else if (float32_is_zero(a)) {
1996         return float64_set_sign(float64_zero, float32_is_neg(a));
1997     } else {
1998         return soft_float32_to_float64(a, s);
1999     }
2000 }
2001 
2002 float16 float64_to_float16(float64 a, bool ieee, float_status *s)
2003 {
2004     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
2005     FloatParts p = float64_unpack_canonical(a, s);
2006     FloatParts pr = float_to_float(p, fmt16, s);
2007     return float16a_round_pack_canonical(pr, s, fmt16);
2008 }
2009 
2010 float32 float64_to_float32(float64 a, float_status *s)
2011 {
2012     FloatParts p = float64_unpack_canonical(a, s);
2013     FloatParts pr = float_to_float(p, &float32_params, s);
2014     return float32_round_pack_canonical(pr, s);
2015 }
2016 
2017 float32 bfloat16_to_float32(bfloat16 a, float_status *s)
2018 {
2019     FloatParts p = bfloat16_unpack_canonical(a, s);
2020     FloatParts pr = float_to_float(p, &float32_params, s);
2021     return float32_round_pack_canonical(pr, s);
2022 }
2023 
2024 float64 bfloat16_to_float64(bfloat16 a, float_status *s)
2025 {
2026     FloatParts p = bfloat16_unpack_canonical(a, s);
2027     FloatParts pr = float_to_float(p, &float64_params, s);
2028     return float64_round_pack_canonical(pr, s);
2029 }
2030 
2031 bfloat16 float32_to_bfloat16(float32 a, float_status *s)
2032 {
2033     FloatParts p = float32_unpack_canonical(a, s);
2034     FloatParts pr = float_to_float(p, &bfloat16_params, s);
2035     return bfloat16_round_pack_canonical(pr, s);
2036 }
2037 
2038 bfloat16 float64_to_bfloat16(float64 a, float_status *s)
2039 {
2040     FloatParts p = float64_unpack_canonical(a, s);
2041     FloatParts pr = float_to_float(p, &bfloat16_params, s);
2042     return bfloat16_round_pack_canonical(pr, s);
2043 }
2044 
2045 /*
2046  * Rounds the floating-point value `a' to an integer, and returns the
2047  * result as a floating-point value. The operation is performed
2048  * according to the IEC/IEEE Standard for Binary Floating-Point
2049  * Arithmetic.
2050  */
2051 
2052 static FloatParts round_to_int(FloatParts a, FloatRoundMode rmode,
2053                                int scale, float_status *s)
2054 {
2055     switch (a.cls) {
2056     case float_class_qnan:
2057     case float_class_snan:
2058         return return_nan(a, s);
2059 
2060     case float_class_zero:
2061     case float_class_inf:
2062         /* already "integral" */
2063         break;
2064 
2065     case float_class_normal:
2066         scale = MIN(MAX(scale, -0x10000), 0x10000);
2067         a.exp += scale;
2068 
2069         if (a.exp >= DECOMPOSED_BINARY_POINT) {
2070             /* already integral */
2071             break;
2072         }
2073         if (a.exp < 0) {
2074             bool one;
2075             /* all fractional */
2076             s->float_exception_flags |= float_flag_inexact;
2077             switch (rmode) {
2078             case float_round_nearest_even:
2079                 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
2080                 break;
2081             case float_round_ties_away:
2082                 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
2083                 break;
2084             case float_round_to_zero:
2085                 one = false;
2086                 break;
2087             case float_round_up:
2088                 one = !a.sign;
2089                 break;
2090             case float_round_down:
2091                 one = a.sign;
2092                 break;
2093             case float_round_to_odd:
2094                 one = true;
2095                 break;
2096             default:
2097                 g_assert_not_reached();
2098             }
2099 
2100             if (one) {
2101                 a.frac = DECOMPOSED_IMPLICIT_BIT;
2102                 a.exp = 0;
2103             } else {
2104                 a.cls = float_class_zero;
2105             }
2106         } else {
2107             uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
2108             uint64_t frac_lsbm1 = frac_lsb >> 1;
2109             uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
2110             uint64_t rnd_mask = rnd_even_mask >> 1;
2111             uint64_t inc;
2112 
2113             switch (rmode) {
2114             case float_round_nearest_even:
2115                 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
2116                 break;
2117             case float_round_ties_away:
2118                 inc = frac_lsbm1;
2119                 break;
2120             case float_round_to_zero:
2121                 inc = 0;
2122                 break;
2123             case float_round_up:
2124                 inc = a.sign ? 0 : rnd_mask;
2125                 break;
2126             case float_round_down:
2127                 inc = a.sign ? rnd_mask : 0;
2128                 break;
2129             case float_round_to_odd:
2130                 inc = a.frac & frac_lsb ? 0 : rnd_mask;
2131                 break;
2132             default:
2133                 g_assert_not_reached();
2134             }
2135 
2136             if (a.frac & rnd_mask) {
2137                 s->float_exception_flags |= float_flag_inexact;
2138                 a.frac += inc;
2139                 a.frac &= ~rnd_mask;
2140                 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
2141                     a.frac >>= 1;
2142                     a.exp++;
2143                 }
2144             }
2145         }
2146         break;
2147     default:
2148         g_assert_not_reached();
2149     }
2150     return a;
2151 }
2152 
2153 float16 float16_round_to_int(float16 a, float_status *s)
2154 {
2155     FloatParts pa = float16_unpack_canonical(a, s);
2156     FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2157     return float16_round_pack_canonical(pr, s);
2158 }
2159 
2160 float32 float32_round_to_int(float32 a, float_status *s)
2161 {
2162     FloatParts pa = float32_unpack_canonical(a, s);
2163     FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2164     return float32_round_pack_canonical(pr, s);
2165 }
2166 
2167 float64 float64_round_to_int(float64 a, float_status *s)
2168 {
2169     FloatParts pa = float64_unpack_canonical(a, s);
2170     FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2171     return float64_round_pack_canonical(pr, s);
2172 }
2173 
2174 /*
2175  * Rounds the bfloat16 value `a' to an integer, and returns the
2176  * result as a bfloat16 value.
2177  */
2178 
2179 bfloat16 bfloat16_round_to_int(bfloat16 a, float_status *s)
2180 {
2181     FloatParts pa = bfloat16_unpack_canonical(a, s);
2182     FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2183     return bfloat16_round_pack_canonical(pr, s);
2184 }
2185 
2186 /*
2187  * Returns the result of converting the floating-point value `a' to
2188  * the two's complement integer format. The conversion is performed
2189  * according to the IEC/IEEE Standard for Binary Floating-Point
2190  * Arithmetic---which means in particular that the conversion is
2191  * rounded according to the current rounding mode. If `a' is a NaN,
2192  * the largest positive integer is returned. Otherwise, if the
2193  * conversion overflows, the largest integer with the same sign as `a'
2194  * is returned.
2195 */
2196 
2197 static int64_t round_to_int_and_pack(FloatParts in, FloatRoundMode rmode,
2198                                      int scale, int64_t min, int64_t max,
2199                                      float_status *s)
2200 {
2201     uint64_t r;
2202     int orig_flags = get_float_exception_flags(s);
2203     FloatParts p = round_to_int(in, rmode, scale, s);
2204 
2205     switch (p.cls) {
2206     case float_class_snan:
2207     case float_class_qnan:
2208         s->float_exception_flags = orig_flags | float_flag_invalid;
2209         return max;
2210     case float_class_inf:
2211         s->float_exception_flags = orig_flags | float_flag_invalid;
2212         return p.sign ? min : max;
2213     case float_class_zero:
2214         return 0;
2215     case float_class_normal:
2216         if (p.exp < DECOMPOSED_BINARY_POINT) {
2217             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
2218         } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
2219             r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
2220         } else {
2221             r = UINT64_MAX;
2222         }
2223         if (p.sign) {
2224             if (r <= -(uint64_t) min) {
2225                 return -r;
2226             } else {
2227                 s->float_exception_flags = orig_flags | float_flag_invalid;
2228                 return min;
2229             }
2230         } else {
2231             if (r <= max) {
2232                 return r;
2233             } else {
2234                 s->float_exception_flags = orig_flags | float_flag_invalid;
2235                 return max;
2236             }
2237         }
2238     default:
2239         g_assert_not_reached();
2240     }
2241 }
2242 
2243 int8_t float16_to_int8_scalbn(float16 a, FloatRoundMode rmode, int scale,
2244                               float_status *s)
2245 {
2246     return round_to_int_and_pack(float16_unpack_canonical(a, s),
2247                                  rmode, scale, INT8_MIN, INT8_MAX, s);
2248 }
2249 
2250 int16_t float16_to_int16_scalbn(float16 a, FloatRoundMode rmode, int scale,
2251                                 float_status *s)
2252 {
2253     return round_to_int_and_pack(float16_unpack_canonical(a, s),
2254                                  rmode, scale, INT16_MIN, INT16_MAX, s);
2255 }
2256 
2257 int32_t float16_to_int32_scalbn(float16 a, FloatRoundMode rmode, int scale,
2258                                 float_status *s)
2259 {
2260     return round_to_int_and_pack(float16_unpack_canonical(a, s),
2261                                  rmode, scale, INT32_MIN, INT32_MAX, s);
2262 }
2263 
2264 int64_t float16_to_int64_scalbn(float16 a, FloatRoundMode rmode, int scale,
2265                                 float_status *s)
2266 {
2267     return round_to_int_and_pack(float16_unpack_canonical(a, s),
2268                                  rmode, scale, INT64_MIN, INT64_MAX, s);
2269 }
2270 
2271 int16_t float32_to_int16_scalbn(float32 a, FloatRoundMode rmode, int scale,
2272                                 float_status *s)
2273 {
2274     return round_to_int_and_pack(float32_unpack_canonical(a, s),
2275                                  rmode, scale, INT16_MIN, INT16_MAX, s);
2276 }
2277 
2278 int32_t float32_to_int32_scalbn(float32 a, FloatRoundMode rmode, int scale,
2279                                 float_status *s)
2280 {
2281     return round_to_int_and_pack(float32_unpack_canonical(a, s),
2282                                  rmode, scale, INT32_MIN, INT32_MAX, s);
2283 }
2284 
2285 int64_t float32_to_int64_scalbn(float32 a, FloatRoundMode rmode, int scale,
2286                                 float_status *s)
2287 {
2288     return round_to_int_and_pack(float32_unpack_canonical(a, s),
2289                                  rmode, scale, INT64_MIN, INT64_MAX, s);
2290 }
2291 
2292 int16_t float64_to_int16_scalbn(float64 a, FloatRoundMode rmode, int scale,
2293                                 float_status *s)
2294 {
2295     return round_to_int_and_pack(float64_unpack_canonical(a, s),
2296                                  rmode, scale, INT16_MIN, INT16_MAX, s);
2297 }
2298 
2299 int32_t float64_to_int32_scalbn(float64 a, FloatRoundMode rmode, int scale,
2300                                 float_status *s)
2301 {
2302     return round_to_int_and_pack(float64_unpack_canonical(a, s),
2303                                  rmode, scale, INT32_MIN, INT32_MAX, s);
2304 }
2305 
2306 int64_t float64_to_int64_scalbn(float64 a, FloatRoundMode rmode, int scale,
2307                                 float_status *s)
2308 {
2309     return round_to_int_and_pack(float64_unpack_canonical(a, s),
2310                                  rmode, scale, INT64_MIN, INT64_MAX, s);
2311 }
2312 
2313 int8_t float16_to_int8(float16 a, float_status *s)
2314 {
2315     return float16_to_int8_scalbn(a, s->float_rounding_mode, 0, s);
2316 }
2317 
2318 int16_t float16_to_int16(float16 a, float_status *s)
2319 {
2320     return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2321 }
2322 
2323 int32_t float16_to_int32(float16 a, float_status *s)
2324 {
2325     return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2326 }
2327 
2328 int64_t float16_to_int64(float16 a, float_status *s)
2329 {
2330     return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2331 }
2332 
2333 int16_t float32_to_int16(float32 a, float_status *s)
2334 {
2335     return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2336 }
2337 
2338 int32_t float32_to_int32(float32 a, float_status *s)
2339 {
2340     return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2341 }
2342 
2343 int64_t float32_to_int64(float32 a, float_status *s)
2344 {
2345     return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2346 }
2347 
2348 int16_t float64_to_int16(float64 a, float_status *s)
2349 {
2350     return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2351 }
2352 
2353 int32_t float64_to_int32(float64 a, float_status *s)
2354 {
2355     return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2356 }
2357 
2358 int64_t float64_to_int64(float64 a, float_status *s)
2359 {
2360     return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2361 }
2362 
2363 int16_t float16_to_int16_round_to_zero(float16 a, float_status *s)
2364 {
2365     return float16_to_int16_scalbn(a, float_round_to_zero, 0, s);
2366 }
2367 
2368 int32_t float16_to_int32_round_to_zero(float16 a, float_status *s)
2369 {
2370     return float16_to_int32_scalbn(a, float_round_to_zero, 0, s);
2371 }
2372 
2373 int64_t float16_to_int64_round_to_zero(float16 a, float_status *s)
2374 {
2375     return float16_to_int64_scalbn(a, float_round_to_zero, 0, s);
2376 }
2377 
2378 int16_t float32_to_int16_round_to_zero(float32 a, float_status *s)
2379 {
2380     return float32_to_int16_scalbn(a, float_round_to_zero, 0, s);
2381 }
2382 
2383 int32_t float32_to_int32_round_to_zero(float32 a, float_status *s)
2384 {
2385     return float32_to_int32_scalbn(a, float_round_to_zero, 0, s);
2386 }
2387 
2388 int64_t float32_to_int64_round_to_zero(float32 a, float_status *s)
2389 {
2390     return float32_to_int64_scalbn(a, float_round_to_zero, 0, s);
2391 }
2392 
2393 int16_t float64_to_int16_round_to_zero(float64 a, float_status *s)
2394 {
2395     return float64_to_int16_scalbn(a, float_round_to_zero, 0, s);
2396 }
2397 
2398 int32_t float64_to_int32_round_to_zero(float64 a, float_status *s)
2399 {
2400     return float64_to_int32_scalbn(a, float_round_to_zero, 0, s);
2401 }
2402 
2403 int64_t float64_to_int64_round_to_zero(float64 a, float_status *s)
2404 {
2405     return float64_to_int64_scalbn(a, float_round_to_zero, 0, s);
2406 }
2407 
2408 /*
2409  * Returns the result of converting the floating-point value `a' to
2410  * the two's complement integer format.
2411  */
2412 
2413 int16_t bfloat16_to_int16_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
2414                                  float_status *s)
2415 {
2416     return round_to_int_and_pack(bfloat16_unpack_canonical(a, s),
2417                                  rmode, scale, INT16_MIN, INT16_MAX, s);
2418 }
2419 
2420 int32_t bfloat16_to_int32_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
2421                                  float_status *s)
2422 {
2423     return round_to_int_and_pack(bfloat16_unpack_canonical(a, s),
2424                                  rmode, scale, INT32_MIN, INT32_MAX, s);
2425 }
2426 
2427 int64_t bfloat16_to_int64_scalbn(bfloat16 a, FloatRoundMode rmode, int scale,
2428                                  float_status *s)
2429 {
2430     return round_to_int_and_pack(bfloat16_unpack_canonical(a, s),
2431                                  rmode, scale, INT64_MIN, INT64_MAX, s);
2432 }
2433 
2434 int16_t bfloat16_to_int16(bfloat16 a, float_status *s)
2435 {
2436     return bfloat16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2437 }
2438 
2439 int32_t bfloat16_to_int32(bfloat16 a, float_status *s)
2440 {
2441     return bfloat16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2442 }
2443 
2444 int64_t bfloat16_to_int64(bfloat16 a, float_status *s)
2445 {
2446     return bfloat16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2447 }
2448 
2449 int16_t bfloat16_to_int16_round_to_zero(bfloat16 a, float_status *s)
2450 {
2451     return bfloat16_to_int16_scalbn(a, float_round_to_zero, 0, s);
2452 }
2453 
2454 int32_t bfloat16_to_int32_round_to_zero(bfloat16 a, float_status *s)
2455 {
2456     return bfloat16_to_int32_scalbn(a, float_round_to_zero, 0, s);
2457 }
2458 
2459 int64_t bfloat16_to_int64_round_to_zero(bfloat16 a, float_status *s)
2460 {
2461     return bfloat16_to_int64_scalbn(a, float_round_to_zero, 0, s);
2462 }
2463 
2464 /*
2465  *  Returns the result of converting the floating-point value `a' to
2466  *  the unsigned integer format. The conversion is performed according
2467  *  to the IEC/IEEE Standard for Binary Floating-Point
2468  *  Arithmetic---which means in particular that the conversion is
2469  *  rounded according to the current rounding mode. If `a' is a NaN,
2470  *  the largest unsigned integer is returned. Otherwise, if the
2471  *  conversion overflows, the largest unsigned integer is returned. If
2472  *  the 'a' is negative, the result is rounded and zero is returned;
2473  *  values that do not round to zero will raise the inexact exception
2474  *  flag.
2475  */
2476 
2477 static uint64_t round_to_uint_and_pack(FloatParts in, FloatRoundMode rmode,
2478                                        int scale, uint64_t max,
2479                                        float_status *s)
2480 {
2481     int orig_flags = get_float_exception_flags(s);
2482     FloatParts p = round_to_int(in, rmode, scale, s);
2483     uint64_t r;
2484 
2485     switch (p.cls) {
2486     case float_class_snan:
2487     case float_class_qnan:
2488         s->float_exception_flags = orig_flags | float_flag_invalid;
2489         return max;
2490     case float_class_inf:
2491         s->float_exception_flags = orig_flags | float_flag_invalid;
2492         return p.sign ? 0 : max;
2493     case float_class_zero:
2494         return 0;
2495     case float_class_normal:
2496         if (p.sign) {
2497             s->float_exception_flags = orig_flags | float_flag_invalid;
2498             return 0;
2499         }
2500 
2501         if (p.exp < DECOMPOSED_BINARY_POINT) {
2502             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
2503         } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
2504             r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
2505         } else {
2506             s->float_exception_flags = orig_flags | float_flag_invalid;
2507             return max;
2508         }
2509 
2510         /* For uint64 this will never trip, but if p.exp is too large
2511          * to shift a decomposed fraction we shall have exited via the
2512          * 3rd leg above.
2513          */
2514         if (r > max) {
2515             s->float_exception_flags = orig_flags | float_flag_invalid;
2516             return max;
2517         }
2518         return r;
2519     default:
2520         g_assert_not_reached();
2521     }
2522 }
2523 
2524 uint8_t float16_to_uint8_scalbn(float16 a, FloatRoundMode rmode, int scale,
2525                                 float_status *s)
2526 {
2527     return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2528                                   rmode, scale, UINT8_MAX, s);
2529 }
2530 
2531 uint16_t float16_to_uint16_scalbn(float16 a, FloatRoundMode rmode, int scale,
2532                                   float_status *s)
2533 {
2534     return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2535                                   rmode, scale, UINT16_MAX, s);
2536 }
2537 
2538 uint32_t float16_to_uint32_scalbn(float16 a, FloatRoundMode rmode, int scale,
2539                                   float_status *s)
2540 {
2541     return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2542                                   rmode, scale, UINT32_MAX, s);
2543 }
2544 
2545 uint64_t float16_to_uint64_scalbn(float16 a, FloatRoundMode rmode, int scale,
2546                                   float_status *s)
2547 {
2548     return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2549                                   rmode, scale, UINT64_MAX, s);
2550 }
2551 
2552 uint16_t float32_to_uint16_scalbn(float32 a, FloatRoundMode rmode, int scale,
2553                                   float_status *s)
2554 {
2555     return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2556                                   rmode, scale, UINT16_MAX, s);
2557 }
2558 
2559 uint32_t float32_to_uint32_scalbn(float32 a, FloatRoundMode rmode, int scale,
2560                                   float_status *s)
2561 {
2562     return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2563                                   rmode, scale, UINT32_MAX, s);
2564 }
2565 
2566 uint64_t float32_to_uint64_scalbn(float32 a, FloatRoundMode rmode, int scale,
2567                                   float_status *s)
2568 {
2569     return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2570                                   rmode, scale, UINT64_MAX, s);
2571 }
2572 
2573 uint16_t float64_to_uint16_scalbn(float64 a, FloatRoundMode rmode, int scale,
2574                                   float_status *s)
2575 {
2576     return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2577                                   rmode, scale, UINT16_MAX, s);
2578 }
2579 
2580 uint32_t float64_to_uint32_scalbn(float64 a, FloatRoundMode rmode, int scale,
2581                                   float_status *s)
2582 {
2583     return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2584                                   rmode, scale, UINT32_MAX, s);
2585 }
2586 
2587 uint64_t float64_to_uint64_scalbn(float64 a, FloatRoundMode rmode, int scale,
2588                                   float_status *s)
2589 {
2590     return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2591                                   rmode, scale, UINT64_MAX, s);
2592 }
2593 
2594 uint8_t float16_to_uint8(float16 a, float_status *s)
2595 {
2596     return float16_to_uint8_scalbn(a, s->float_rounding_mode, 0, s);
2597 }
2598 
2599 uint16_t float16_to_uint16(float16 a, float_status *s)
2600 {
2601     return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2602 }
2603 
2604 uint32_t float16_to_uint32(float16 a, float_status *s)
2605 {
2606     return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2607 }
2608 
2609 uint64_t float16_to_uint64(float16 a, float_status *s)
2610 {
2611     return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2612 }
2613 
2614 uint16_t float32_to_uint16(float32 a, float_status *s)
2615 {
2616     return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2617 }
2618 
2619 uint32_t float32_to_uint32(float32 a, float_status *s)
2620 {
2621     return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2622 }
2623 
2624 uint64_t float32_to_uint64(float32 a, float_status *s)
2625 {
2626     return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2627 }
2628 
2629 uint16_t float64_to_uint16(float64 a, float_status *s)
2630 {
2631     return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2632 }
2633 
2634 uint32_t float64_to_uint32(float64 a, float_status *s)
2635 {
2636     return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2637 }
2638 
2639 uint64_t float64_to_uint64(float64 a, float_status *s)
2640 {
2641     return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2642 }
2643 
2644 uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s)
2645 {
2646     return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2647 }
2648 
2649 uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s)
2650 {
2651     return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2652 }
2653 
2654 uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s)
2655 {
2656     return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2657 }
2658 
2659 uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *s)
2660 {
2661     return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2662 }
2663 
2664 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s)
2665 {
2666     return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2667 }
2668 
2669 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s)
2670 {
2671     return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2672 }
2673 
2674 uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s)
2675 {
2676     return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2677 }
2678 
2679 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s)
2680 {
2681     return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2682 }
2683 
2684 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s)
2685 {
2686     return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2687 }
2688 
2689 /*
2690  *  Returns the result of converting the bfloat16 value `a' to
2691  *  the unsigned integer format.
2692  */
2693 
2694 uint16_t bfloat16_to_uint16_scalbn(bfloat16 a, FloatRoundMode rmode,
2695                                    int scale, float_status *s)
2696 {
2697     return round_to_uint_and_pack(bfloat16_unpack_canonical(a, s),
2698                                   rmode, scale, UINT16_MAX, s);
2699 }
2700 
2701 uint32_t bfloat16_to_uint32_scalbn(bfloat16 a, FloatRoundMode rmode,
2702                                    int scale, float_status *s)
2703 {
2704     return round_to_uint_and_pack(bfloat16_unpack_canonical(a, s),
2705                                   rmode, scale, UINT32_MAX, s);
2706 }
2707 
2708 uint64_t bfloat16_to_uint64_scalbn(bfloat16 a, FloatRoundMode rmode,
2709                                    int scale, float_status *s)
2710 {
2711     return round_to_uint_and_pack(bfloat16_unpack_canonical(a, s),
2712                                   rmode, scale, UINT64_MAX, s);
2713 }
2714 
2715 uint16_t bfloat16_to_uint16(bfloat16 a, float_status *s)
2716 {
2717     return bfloat16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2718 }
2719 
2720 uint32_t bfloat16_to_uint32(bfloat16 a, float_status *s)
2721 {
2722     return bfloat16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2723 }
2724 
2725 uint64_t bfloat16_to_uint64(bfloat16 a, float_status *s)
2726 {
2727     return bfloat16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2728 }
2729 
2730 uint16_t bfloat16_to_uint16_round_to_zero(bfloat16 a, float_status *s)
2731 {
2732     return bfloat16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2733 }
2734 
2735 uint32_t bfloat16_to_uint32_round_to_zero(bfloat16 a, float_status *s)
2736 {
2737     return bfloat16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2738 }
2739 
2740 uint64_t bfloat16_to_uint64_round_to_zero(bfloat16 a, float_status *s)
2741 {
2742     return bfloat16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2743 }
2744 
2745 /*
2746  * Integer to float conversions
2747  *
2748  * Returns the result of converting the two's complement integer `a'
2749  * to the floating-point format. The conversion is performed according
2750  * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2751  */
2752 
2753 static FloatParts int_to_float(int64_t a, int scale, float_status *status)
2754 {
2755     FloatParts r = { .sign = false };
2756 
2757     if (a == 0) {
2758         r.cls = float_class_zero;
2759     } else {
2760         uint64_t f = a;
2761         int shift;
2762 
2763         r.cls = float_class_normal;
2764         if (a < 0) {
2765             f = -f;
2766             r.sign = true;
2767         }
2768         shift = clz64(f) - 1;
2769         scale = MIN(MAX(scale, -0x10000), 0x10000);
2770 
2771         r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
2772         r.frac = (shift < 0 ? DECOMPOSED_IMPLICIT_BIT : f << shift);
2773     }
2774 
2775     return r;
2776 }
2777 
2778 float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status)
2779 {
2780     FloatParts pa = int_to_float(a, scale, status);
2781     return float16_round_pack_canonical(pa, status);
2782 }
2783 
2784 float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status)
2785 {
2786     return int64_to_float16_scalbn(a, scale, status);
2787 }
2788 
2789 float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status)
2790 {
2791     return int64_to_float16_scalbn(a, scale, status);
2792 }
2793 
2794 float16 int64_to_float16(int64_t a, float_status *status)
2795 {
2796     return int64_to_float16_scalbn(a, 0, status);
2797 }
2798 
2799 float16 int32_to_float16(int32_t a, float_status *status)
2800 {
2801     return int64_to_float16_scalbn(a, 0, status);
2802 }
2803 
2804 float16 int16_to_float16(int16_t a, float_status *status)
2805 {
2806     return int64_to_float16_scalbn(a, 0, status);
2807 }
2808 
2809 float16 int8_to_float16(int8_t a, float_status *status)
2810 {
2811     return int64_to_float16_scalbn(a, 0, status);
2812 }
2813 
2814 float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status)
2815 {
2816     FloatParts pa = int_to_float(a, scale, status);
2817     return float32_round_pack_canonical(pa, status);
2818 }
2819 
2820 float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status)
2821 {
2822     return int64_to_float32_scalbn(a, scale, status);
2823 }
2824 
2825 float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status)
2826 {
2827     return int64_to_float32_scalbn(a, scale, status);
2828 }
2829 
2830 float32 int64_to_float32(int64_t a, float_status *status)
2831 {
2832     return int64_to_float32_scalbn(a, 0, status);
2833 }
2834 
2835 float32 int32_to_float32(int32_t a, float_status *status)
2836 {
2837     return int64_to_float32_scalbn(a, 0, status);
2838 }
2839 
2840 float32 int16_to_float32(int16_t a, float_status *status)
2841 {
2842     return int64_to_float32_scalbn(a, 0, status);
2843 }
2844 
2845 float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status)
2846 {
2847     FloatParts pa = int_to_float(a, scale, status);
2848     return float64_round_pack_canonical(pa, status);
2849 }
2850 
2851 float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status)
2852 {
2853     return int64_to_float64_scalbn(a, scale, status);
2854 }
2855 
2856 float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status)
2857 {
2858     return int64_to_float64_scalbn(a, scale, status);
2859 }
2860 
2861 float64 int64_to_float64(int64_t a, float_status *status)
2862 {
2863     return int64_to_float64_scalbn(a, 0, status);
2864 }
2865 
2866 float64 int32_to_float64(int32_t a, float_status *status)
2867 {
2868     return int64_to_float64_scalbn(a, 0, status);
2869 }
2870 
2871 float64 int16_to_float64(int16_t a, float_status *status)
2872 {
2873     return int64_to_float64_scalbn(a, 0, status);
2874 }
2875 
2876 /*
2877  * Returns the result of converting the two's complement integer `a'
2878  * to the bfloat16 format.
2879  */
2880 
2881 bfloat16 int64_to_bfloat16_scalbn(int64_t a, int scale, float_status *status)
2882 {
2883     FloatParts pa = int_to_float(a, scale, status);
2884     return bfloat16_round_pack_canonical(pa, status);
2885 }
2886 
2887 bfloat16 int32_to_bfloat16_scalbn(int32_t a, int scale, float_status *status)
2888 {
2889     return int64_to_bfloat16_scalbn(a, scale, status);
2890 }
2891 
2892 bfloat16 int16_to_bfloat16_scalbn(int16_t a, int scale, float_status *status)
2893 {
2894     return int64_to_bfloat16_scalbn(a, scale, status);
2895 }
2896 
2897 bfloat16 int64_to_bfloat16(int64_t a, float_status *status)
2898 {
2899     return int64_to_bfloat16_scalbn(a, 0, status);
2900 }
2901 
2902 bfloat16 int32_to_bfloat16(int32_t a, float_status *status)
2903 {
2904     return int64_to_bfloat16_scalbn(a, 0, status);
2905 }
2906 
2907 bfloat16 int16_to_bfloat16(int16_t a, float_status *status)
2908 {
2909     return int64_to_bfloat16_scalbn(a, 0, status);
2910 }
2911 
2912 /*
2913  * Unsigned Integer to float conversions
2914  *
2915  * Returns the result of converting the unsigned integer `a' to the
2916  * floating-point format. The conversion is performed according to the
2917  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2918  */
2919 
2920 static FloatParts uint_to_float(uint64_t a, int scale, float_status *status)
2921 {
2922     FloatParts r = { .sign = false };
2923 
2924     if (a == 0) {
2925         r.cls = float_class_zero;
2926     } else {
2927         scale = MIN(MAX(scale, -0x10000), 0x10000);
2928         r.cls = float_class_normal;
2929         if ((int64_t)a < 0) {
2930             r.exp = DECOMPOSED_BINARY_POINT + 1 + scale;
2931             shift64RightJamming(a, 1, &a);
2932             r.frac = a;
2933         } else {
2934             int shift = clz64(a) - 1;
2935             r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
2936             r.frac = a << shift;
2937         }
2938     }
2939 
2940     return r;
2941 }
2942 
2943 float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status)
2944 {
2945     FloatParts pa = uint_to_float(a, scale, status);
2946     return float16_round_pack_canonical(pa, status);
2947 }
2948 
2949 float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status)
2950 {
2951     return uint64_to_float16_scalbn(a, scale, status);
2952 }
2953 
2954 float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status)
2955 {
2956     return uint64_to_float16_scalbn(a, scale, status);
2957 }
2958 
2959 float16 uint64_to_float16(uint64_t a, float_status *status)
2960 {
2961     return uint64_to_float16_scalbn(a, 0, status);
2962 }
2963 
2964 float16 uint32_to_float16(uint32_t a, float_status *status)
2965 {
2966     return uint64_to_float16_scalbn(a, 0, status);
2967 }
2968 
2969 float16 uint16_to_float16(uint16_t a, float_status *status)
2970 {
2971     return uint64_to_float16_scalbn(a, 0, status);
2972 }
2973 
2974 float16 uint8_to_float16(uint8_t a, float_status *status)
2975 {
2976     return uint64_to_float16_scalbn(a, 0, status);
2977 }
2978 
2979 float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status)
2980 {
2981     FloatParts pa = uint_to_float(a, scale, status);
2982     return float32_round_pack_canonical(pa, status);
2983 }
2984 
2985 float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status)
2986 {
2987     return uint64_to_float32_scalbn(a, scale, status);
2988 }
2989 
2990 float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status)
2991 {
2992     return uint64_to_float32_scalbn(a, scale, status);
2993 }
2994 
2995 float32 uint64_to_float32(uint64_t a, float_status *status)
2996 {
2997     return uint64_to_float32_scalbn(a, 0, status);
2998 }
2999 
3000 float32 uint32_to_float32(uint32_t a, float_status *status)
3001 {
3002     return uint64_to_float32_scalbn(a, 0, status);
3003 }
3004 
3005 float32 uint16_to_float32(uint16_t a, float_status *status)
3006 {
3007     return uint64_to_float32_scalbn(a, 0, status);
3008 }
3009 
3010 float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status)
3011 {
3012     FloatParts pa = uint_to_float(a, scale, status);
3013     return float64_round_pack_canonical(pa, status);
3014 }
3015 
3016 float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status)
3017 {
3018     return uint64_to_float64_scalbn(a, scale, status);
3019 }
3020 
3021 float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status)
3022 {
3023     return uint64_to_float64_scalbn(a, scale, status);
3024 }
3025 
3026 float64 uint64_to_float64(uint64_t a, float_status *status)
3027 {
3028     return uint64_to_float64_scalbn(a, 0, status);
3029 }
3030 
3031 float64 uint32_to_float64(uint32_t a, float_status *status)
3032 {
3033     return uint64_to_float64_scalbn(a, 0, status);
3034 }
3035 
3036 float64 uint16_to_float64(uint16_t a, float_status *status)
3037 {
3038     return uint64_to_float64_scalbn(a, 0, status);
3039 }
3040 
3041 /*
3042  * Returns the result of converting the unsigned integer `a' to the
3043  * bfloat16 format.
3044  */
3045 
3046 bfloat16 uint64_to_bfloat16_scalbn(uint64_t a, int scale, float_status *status)
3047 {
3048     FloatParts pa = uint_to_float(a, scale, status);
3049     return bfloat16_round_pack_canonical(pa, status);
3050 }
3051 
3052 bfloat16 uint32_to_bfloat16_scalbn(uint32_t a, int scale, float_status *status)
3053 {
3054     return uint64_to_bfloat16_scalbn(a, scale, status);
3055 }
3056 
3057 bfloat16 uint16_to_bfloat16_scalbn(uint16_t a, int scale, float_status *status)
3058 {
3059     return uint64_to_bfloat16_scalbn(a, scale, status);
3060 }
3061 
3062 bfloat16 uint64_to_bfloat16(uint64_t a, float_status *status)
3063 {
3064     return uint64_to_bfloat16_scalbn(a, 0, status);
3065 }
3066 
3067 bfloat16 uint32_to_bfloat16(uint32_t a, float_status *status)
3068 {
3069     return uint64_to_bfloat16_scalbn(a, 0, status);
3070 }
3071 
3072 bfloat16 uint16_to_bfloat16(uint16_t a, float_status *status)
3073 {
3074     return uint64_to_bfloat16_scalbn(a, 0, status);
3075 }
3076 
3077 /* Float Min/Max */
3078 /* min() and max() functions. These can't be implemented as
3079  * 'compare and pick one input' because that would mishandle
3080  * NaNs and +0 vs -0.
3081  *
3082  * minnum() and maxnum() functions. These are similar to the min()
3083  * and max() functions but if one of the arguments is a QNaN and
3084  * the other is numerical then the numerical argument is returned.
3085  * SNaNs will get quietened before being returned.
3086  * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
3087  * and maxNum() operations. min() and max() are the typical min/max
3088  * semantics provided by many CPUs which predate that specification.
3089  *
3090  * minnummag() and maxnummag() functions correspond to minNumMag()
3091  * and minNumMag() from the IEEE-754 2008.
3092  */
3093 static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin,
3094                                 bool ieee, bool ismag, float_status *s)
3095 {
3096     if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
3097         if (ieee) {
3098             /* Takes two floating-point values `a' and `b', one of
3099              * which is a NaN, and returns the appropriate NaN
3100              * result. If either `a' or `b' is a signaling NaN,
3101              * the invalid exception is raised.
3102              */
3103             if (is_snan(a.cls) || is_snan(b.cls)) {
3104                 return pick_nan(a, b, s);
3105             } else if (is_nan(a.cls) && !is_nan(b.cls)) {
3106                 return b;
3107             } else if (is_nan(b.cls) && !is_nan(a.cls)) {
3108                 return a;
3109             }
3110         }
3111         return pick_nan(a, b, s);
3112     } else {
3113         int a_exp, b_exp;
3114 
3115         switch (a.cls) {
3116         case float_class_normal:
3117             a_exp = a.exp;
3118             break;
3119         case float_class_inf:
3120             a_exp = INT_MAX;
3121             break;
3122         case float_class_zero:
3123             a_exp = INT_MIN;
3124             break;
3125         default:
3126             g_assert_not_reached();
3127             break;
3128         }
3129         switch (b.cls) {
3130         case float_class_normal:
3131             b_exp = b.exp;
3132             break;
3133         case float_class_inf:
3134             b_exp = INT_MAX;
3135             break;
3136         case float_class_zero:
3137             b_exp = INT_MIN;
3138             break;
3139         default:
3140             g_assert_not_reached();
3141             break;
3142         }
3143 
3144         if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
3145             bool a_less = a_exp < b_exp;
3146             if (a_exp == b_exp) {
3147                 a_less = a.frac < b.frac;
3148             }
3149             return a_less ^ ismin ? b : a;
3150         }
3151 
3152         if (a.sign == b.sign) {
3153             bool a_less = a_exp < b_exp;
3154             if (a_exp == b_exp) {
3155                 a_less = a.frac < b.frac;
3156             }
3157             return a.sign ^ a_less ^ ismin ? b : a;
3158         } else {
3159             return a.sign ^ ismin ? b : a;
3160         }
3161     }
3162 }
3163 
3164 #define MINMAX(sz, name, ismin, isiee, ismag)                           \
3165 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b,      \
3166                                      float_status *s)                   \
3167 {                                                                       \
3168     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
3169     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
3170     FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);      \
3171                                                                         \
3172     return float ## sz ## _round_pack_canonical(pr, s);                 \
3173 }
3174 
3175 MINMAX(16, min, true, false, false)
3176 MINMAX(16, minnum, true, true, false)
3177 MINMAX(16, minnummag, true, true, true)
3178 MINMAX(16, max, false, false, false)
3179 MINMAX(16, maxnum, false, true, false)
3180 MINMAX(16, maxnummag, false, true, true)
3181 
3182 MINMAX(32, min, true, false, false)
3183 MINMAX(32, minnum, true, true, false)
3184 MINMAX(32, minnummag, true, true, true)
3185 MINMAX(32, max, false, false, false)
3186 MINMAX(32, maxnum, false, true, false)
3187 MINMAX(32, maxnummag, false, true, true)
3188 
3189 MINMAX(64, min, true, false, false)
3190 MINMAX(64, minnum, true, true, false)
3191 MINMAX(64, minnummag, true, true, true)
3192 MINMAX(64, max, false, false, false)
3193 MINMAX(64, maxnum, false, true, false)
3194 MINMAX(64, maxnummag, false, true, true)
3195 
3196 #undef MINMAX
3197 
3198 #define BF16_MINMAX(name, ismin, isiee, ismag)                          \
3199 bfloat16 bfloat16_ ## name(bfloat16 a, bfloat16 b, float_status *s)     \
3200 {                                                                       \
3201     FloatParts pa = bfloat16_unpack_canonical(a, s);                    \
3202     FloatParts pb = bfloat16_unpack_canonical(b, s);                    \
3203     FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);      \
3204                                                                         \
3205     return bfloat16_round_pack_canonical(pr, s);                        \
3206 }
3207 
3208 BF16_MINMAX(min, true, false, false)
3209 BF16_MINMAX(minnum, true, true, false)
3210 BF16_MINMAX(minnummag, true, true, true)
3211 BF16_MINMAX(max, false, false, false)
3212 BF16_MINMAX(maxnum, false, true, false)
3213 BF16_MINMAX(maxnummag, false, true, true)
3214 
3215 #undef BF16_MINMAX
3216 
3217 /* Floating point compare */
3218 static FloatRelation compare_floats(FloatParts a, FloatParts b, bool is_quiet,
3219                                     float_status *s)
3220 {
3221     if (is_nan(a.cls) || is_nan(b.cls)) {
3222         if (!is_quiet ||
3223             a.cls == float_class_snan ||
3224             b.cls == float_class_snan) {
3225             s->float_exception_flags |= float_flag_invalid;
3226         }
3227         return float_relation_unordered;
3228     }
3229 
3230     if (a.cls == float_class_zero) {
3231         if (b.cls == float_class_zero) {
3232             return float_relation_equal;
3233         }
3234         return b.sign ? float_relation_greater : float_relation_less;
3235     } else if (b.cls == float_class_zero) {
3236         return a.sign ? float_relation_less : float_relation_greater;
3237     }
3238 
3239     /* The only really important thing about infinity is its sign. If
3240      * both are infinities the sign marks the smallest of the two.
3241      */
3242     if (a.cls == float_class_inf) {
3243         if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
3244             return float_relation_equal;
3245         }
3246         return a.sign ? float_relation_less : float_relation_greater;
3247     } else if (b.cls == float_class_inf) {
3248         return b.sign ? float_relation_greater : float_relation_less;
3249     }
3250 
3251     if (a.sign != b.sign) {
3252         return a.sign ? float_relation_less : float_relation_greater;
3253     }
3254 
3255     if (a.exp == b.exp) {
3256         if (a.frac == b.frac) {
3257             return float_relation_equal;
3258         }
3259         if (a.sign) {
3260             return a.frac > b.frac ?
3261                 float_relation_less : float_relation_greater;
3262         } else {
3263             return a.frac > b.frac ?
3264                 float_relation_greater : float_relation_less;
3265         }
3266     } else {
3267         if (a.sign) {
3268             return a.exp > b.exp ? float_relation_less : float_relation_greater;
3269         } else {
3270             return a.exp > b.exp ? float_relation_greater : float_relation_less;
3271         }
3272     }
3273 }
3274 
3275 #define COMPARE(name, attr, sz)                                         \
3276 static int attr                                                         \
3277 name(float ## sz a, float ## sz b, bool is_quiet, float_status *s)      \
3278 {                                                                       \
3279     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
3280     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
3281     return compare_floats(pa, pb, is_quiet, s);                         \
3282 }
3283 
3284 COMPARE(soft_f16_compare, QEMU_FLATTEN, 16)
3285 COMPARE(soft_f32_compare, QEMU_SOFTFLOAT_ATTR, 32)
3286 COMPARE(soft_f64_compare, QEMU_SOFTFLOAT_ATTR, 64)
3287 
3288 #undef COMPARE
3289 
3290 FloatRelation float16_compare(float16 a, float16 b, float_status *s)
3291 {
3292     return soft_f16_compare(a, b, false, s);
3293 }
3294 
3295 FloatRelation float16_compare_quiet(float16 a, float16 b, float_status *s)
3296 {
3297     return soft_f16_compare(a, b, true, s);
3298 }
3299 
3300 static FloatRelation QEMU_FLATTEN
3301 f32_compare(float32 xa, float32 xb, bool is_quiet, float_status *s)
3302 {
3303     union_float32 ua, ub;
3304 
3305     ua.s = xa;
3306     ub.s = xb;
3307 
3308     if (QEMU_NO_HARDFLOAT) {
3309         goto soft;
3310     }
3311 
3312     float32_input_flush2(&ua.s, &ub.s, s);
3313     if (isgreaterequal(ua.h, ub.h)) {
3314         if (isgreater(ua.h, ub.h)) {
3315             return float_relation_greater;
3316         }
3317         return float_relation_equal;
3318     }
3319     if (likely(isless(ua.h, ub.h))) {
3320         return float_relation_less;
3321     }
3322     /* The only condition remaining is unordered.
3323      * Fall through to set flags.
3324      */
3325  soft:
3326     return soft_f32_compare(ua.s, ub.s, is_quiet, s);
3327 }
3328 
3329 FloatRelation float32_compare(float32 a, float32 b, float_status *s)
3330 {
3331     return f32_compare(a, b, false, s);
3332 }
3333 
3334 FloatRelation float32_compare_quiet(float32 a, float32 b, float_status *s)
3335 {
3336     return f32_compare(a, b, true, s);
3337 }
3338 
3339 static FloatRelation QEMU_FLATTEN
3340 f64_compare(float64 xa, float64 xb, bool is_quiet, float_status *s)
3341 {
3342     union_float64 ua, ub;
3343 
3344     ua.s = xa;
3345     ub.s = xb;
3346 
3347     if (QEMU_NO_HARDFLOAT) {
3348         goto soft;
3349     }
3350 
3351     float64_input_flush2(&ua.s, &ub.s, s);
3352     if (isgreaterequal(ua.h, ub.h)) {
3353         if (isgreater(ua.h, ub.h)) {
3354             return float_relation_greater;
3355         }
3356         return float_relation_equal;
3357     }
3358     if (likely(isless(ua.h, ub.h))) {
3359         return float_relation_less;
3360     }
3361     /* The only condition remaining is unordered.
3362      * Fall through to set flags.
3363      */
3364  soft:
3365     return soft_f64_compare(ua.s, ub.s, is_quiet, s);
3366 }
3367 
3368 FloatRelation float64_compare(float64 a, float64 b, float_status *s)
3369 {
3370     return f64_compare(a, b, false, s);
3371 }
3372 
3373 FloatRelation float64_compare_quiet(float64 a, float64 b, float_status *s)
3374 {
3375     return f64_compare(a, b, true, s);
3376 }
3377 
3378 static FloatRelation QEMU_FLATTEN
3379 soft_bf16_compare(bfloat16 a, bfloat16 b, bool is_quiet, float_status *s)
3380 {
3381     FloatParts pa = bfloat16_unpack_canonical(a, s);
3382     FloatParts pb = bfloat16_unpack_canonical(b, s);
3383     return compare_floats(pa, pb, is_quiet, s);
3384 }
3385 
3386 FloatRelation bfloat16_compare(bfloat16 a, bfloat16 b, float_status *s)
3387 {
3388     return soft_bf16_compare(a, b, false, s);
3389 }
3390 
3391 FloatRelation bfloat16_compare_quiet(bfloat16 a, bfloat16 b, float_status *s)
3392 {
3393     return soft_bf16_compare(a, b, true, s);
3394 }
3395 
3396 /* Multiply A by 2 raised to the power N.  */
3397 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s)
3398 {
3399     if (unlikely(is_nan(a.cls))) {
3400         return return_nan(a, s);
3401     }
3402     if (a.cls == float_class_normal) {
3403         /* The largest float type (even though not supported by FloatParts)
3404          * is float128, which has a 15 bit exponent.  Bounding N to 16 bits
3405          * still allows rounding to infinity, without allowing overflow
3406          * within the int32_t that backs FloatParts.exp.
3407          */
3408         n = MIN(MAX(n, -0x10000), 0x10000);
3409         a.exp += n;
3410     }
3411     return a;
3412 }
3413 
3414 float16 float16_scalbn(float16 a, int n, float_status *status)
3415 {
3416     FloatParts pa = float16_unpack_canonical(a, status);
3417     FloatParts pr = scalbn_decomposed(pa, n, status);
3418     return float16_round_pack_canonical(pr, status);
3419 }
3420 
3421 float32 float32_scalbn(float32 a, int n, float_status *status)
3422 {
3423     FloatParts pa = float32_unpack_canonical(a, status);
3424     FloatParts pr = scalbn_decomposed(pa, n, status);
3425     return float32_round_pack_canonical(pr, status);
3426 }
3427 
3428 float64 float64_scalbn(float64 a, int n, float_status *status)
3429 {
3430     FloatParts pa = float64_unpack_canonical(a, status);
3431     FloatParts pr = scalbn_decomposed(pa, n, status);
3432     return float64_round_pack_canonical(pr, status);
3433 }
3434 
3435 bfloat16 bfloat16_scalbn(bfloat16 a, int n, float_status *status)
3436 {
3437     FloatParts pa = bfloat16_unpack_canonical(a, status);
3438     FloatParts pr = scalbn_decomposed(pa, n, status);
3439     return bfloat16_round_pack_canonical(pr, status);
3440 }
3441 
3442 /*
3443  * Square Root
3444  *
3445  * The old softfloat code did an approximation step before zeroing in
3446  * on the final result. However for simpleness we just compute the
3447  * square root by iterating down from the implicit bit to enough extra
3448  * bits to ensure we get a correctly rounded result.
3449  *
3450  * This does mean however the calculation is slower than before,
3451  * especially for 64 bit floats.
3452  */
3453 
3454 static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p)
3455 {
3456     uint64_t a_frac, r_frac, s_frac;
3457     int bit, last_bit;
3458 
3459     if (is_nan(a.cls)) {
3460         return return_nan(a, s);
3461     }
3462     if (a.cls == float_class_zero) {
3463         return a;  /* sqrt(+-0) = +-0 */
3464     }
3465     if (a.sign) {
3466         s->float_exception_flags |= float_flag_invalid;
3467         return parts_default_nan(s);
3468     }
3469     if (a.cls == float_class_inf) {
3470         return a;  /* sqrt(+inf) = +inf */
3471     }
3472 
3473     assert(a.cls == float_class_normal);
3474 
3475     /* We need two overflow bits at the top. Adding room for that is a
3476      * right shift. If the exponent is odd, we can discard the low bit
3477      * by multiplying the fraction by 2; that's a left shift. Combine
3478      * those and we shift right if the exponent is even.
3479      */
3480     a_frac = a.frac;
3481     if (!(a.exp & 1)) {
3482         a_frac >>= 1;
3483     }
3484     a.exp >>= 1;
3485 
3486     /* Bit-by-bit computation of sqrt.  */
3487     r_frac = 0;
3488     s_frac = 0;
3489 
3490     /* Iterate from implicit bit down to the 3 extra bits to compute a
3491      * properly rounded result. Remember we've inserted one more bit
3492      * at the top, so these positions are one less.
3493      */
3494     bit = DECOMPOSED_BINARY_POINT - 1;
3495     last_bit = MAX(p->frac_shift - 4, 0);
3496     do {
3497         uint64_t q = 1ULL << bit;
3498         uint64_t t_frac = s_frac + q;
3499         if (t_frac <= a_frac) {
3500             s_frac = t_frac + q;
3501             a_frac -= t_frac;
3502             r_frac += q;
3503         }
3504         a_frac <<= 1;
3505     } while (--bit >= last_bit);
3506 
3507     /* Undo the right shift done above. If there is any remaining
3508      * fraction, the result is inexact. Set the sticky bit.
3509      */
3510     a.frac = (r_frac << 1) + (a_frac != 0);
3511 
3512     return a;
3513 }
3514 
3515 float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status)
3516 {
3517     FloatParts pa = float16_unpack_canonical(a, status);
3518     FloatParts pr = sqrt_float(pa, status, &float16_params);
3519     return float16_round_pack_canonical(pr, status);
3520 }
3521 
3522 static float32 QEMU_SOFTFLOAT_ATTR
3523 soft_f32_sqrt(float32 a, float_status *status)
3524 {
3525     FloatParts pa = float32_unpack_canonical(a, status);
3526     FloatParts pr = sqrt_float(pa, status, &float32_params);
3527     return float32_round_pack_canonical(pr, status);
3528 }
3529 
3530 static float64 QEMU_SOFTFLOAT_ATTR
3531 soft_f64_sqrt(float64 a, float_status *status)
3532 {
3533     FloatParts pa = float64_unpack_canonical(a, status);
3534     FloatParts pr = sqrt_float(pa, status, &float64_params);
3535     return float64_round_pack_canonical(pr, status);
3536 }
3537 
3538 float32 QEMU_FLATTEN float32_sqrt(float32 xa, float_status *s)
3539 {
3540     union_float32 ua, ur;
3541 
3542     ua.s = xa;
3543     if (unlikely(!can_use_fpu(s))) {
3544         goto soft;
3545     }
3546 
3547     float32_input_flush1(&ua.s, s);
3548     if (QEMU_HARDFLOAT_1F32_USE_FP) {
3549         if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
3550                        fpclassify(ua.h) == FP_ZERO) ||
3551                      signbit(ua.h))) {
3552             goto soft;
3553         }
3554     } else if (unlikely(!float32_is_zero_or_normal(ua.s) ||
3555                         float32_is_neg(ua.s))) {
3556         goto soft;
3557     }
3558     ur.h = sqrtf(ua.h);
3559     return ur.s;
3560 
3561  soft:
3562     return soft_f32_sqrt(ua.s, s);
3563 }
3564 
3565 float64 QEMU_FLATTEN float64_sqrt(float64 xa, float_status *s)
3566 {
3567     union_float64 ua, ur;
3568 
3569     ua.s = xa;
3570     if (unlikely(!can_use_fpu(s))) {
3571         goto soft;
3572     }
3573 
3574     float64_input_flush1(&ua.s, s);
3575     if (QEMU_HARDFLOAT_1F64_USE_FP) {
3576         if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
3577                        fpclassify(ua.h) == FP_ZERO) ||
3578                      signbit(ua.h))) {
3579             goto soft;
3580         }
3581     } else if (unlikely(!float64_is_zero_or_normal(ua.s) ||
3582                         float64_is_neg(ua.s))) {
3583         goto soft;
3584     }
3585     ur.h = sqrt(ua.h);
3586     return ur.s;
3587 
3588  soft:
3589     return soft_f64_sqrt(ua.s, s);
3590 }
3591 
3592 bfloat16 QEMU_FLATTEN bfloat16_sqrt(bfloat16 a, float_status *status)
3593 {
3594     FloatParts pa = bfloat16_unpack_canonical(a, status);
3595     FloatParts pr = sqrt_float(pa, status, &bfloat16_params);
3596     return bfloat16_round_pack_canonical(pr, status);
3597 }
3598 
3599 /*----------------------------------------------------------------------------
3600 | The pattern for a default generated NaN.
3601 *----------------------------------------------------------------------------*/
3602 
3603 float16 float16_default_nan(float_status *status)
3604 {
3605     FloatParts p = parts_default_nan(status);
3606     p.frac >>= float16_params.frac_shift;
3607     return float16_pack_raw(p);
3608 }
3609 
3610 float32 float32_default_nan(float_status *status)
3611 {
3612     FloatParts p = parts_default_nan(status);
3613     p.frac >>= float32_params.frac_shift;
3614     return float32_pack_raw(p);
3615 }
3616 
3617 float64 float64_default_nan(float_status *status)
3618 {
3619     FloatParts p = parts_default_nan(status);
3620     p.frac >>= float64_params.frac_shift;
3621     return float64_pack_raw(p);
3622 }
3623 
3624 float128 float128_default_nan(float_status *status)
3625 {
3626     FloatParts p = parts_default_nan(status);
3627     float128 r;
3628 
3629     /* Extrapolate from the choices made by parts_default_nan to fill
3630      * in the quad-floating format.  If the low bit is set, assume we
3631      * want to set all non-snan bits.
3632      */
3633     r.low = -(p.frac & 1);
3634     r.high = p.frac >> (DECOMPOSED_BINARY_POINT - 48);
3635     r.high |= UINT64_C(0x7FFF000000000000);
3636     r.high |= (uint64_t)p.sign << 63;
3637 
3638     return r;
3639 }
3640 
3641 bfloat16 bfloat16_default_nan(float_status *status)
3642 {
3643     FloatParts p = parts_default_nan(status);
3644     p.frac >>= bfloat16_params.frac_shift;
3645     return bfloat16_pack_raw(p);
3646 }
3647 
3648 /*----------------------------------------------------------------------------
3649 | Returns a quiet NaN from a signalling NaN for the floating point value `a'.
3650 *----------------------------------------------------------------------------*/
3651 
3652 float16 float16_silence_nan(float16 a, float_status *status)
3653 {
3654     FloatParts p = float16_unpack_raw(a);
3655     p.frac <<= float16_params.frac_shift;
3656     p = parts_silence_nan(p, status);
3657     p.frac >>= float16_params.frac_shift;
3658     return float16_pack_raw(p);
3659 }
3660 
3661 float32 float32_silence_nan(float32 a, float_status *status)
3662 {
3663     FloatParts p = float32_unpack_raw(a);
3664     p.frac <<= float32_params.frac_shift;
3665     p = parts_silence_nan(p, status);
3666     p.frac >>= float32_params.frac_shift;
3667     return float32_pack_raw(p);
3668 }
3669 
3670 float64 float64_silence_nan(float64 a, float_status *status)
3671 {
3672     FloatParts p = float64_unpack_raw(a);
3673     p.frac <<= float64_params.frac_shift;
3674     p = parts_silence_nan(p, status);
3675     p.frac >>= float64_params.frac_shift;
3676     return float64_pack_raw(p);
3677 }
3678 
3679 bfloat16 bfloat16_silence_nan(bfloat16 a, float_status *status)
3680 {
3681     FloatParts p = bfloat16_unpack_raw(a);
3682     p.frac <<= bfloat16_params.frac_shift;
3683     p = parts_silence_nan(p, status);
3684     p.frac >>= bfloat16_params.frac_shift;
3685     return bfloat16_pack_raw(p);
3686 }
3687 
3688 /*----------------------------------------------------------------------------
3689 | If `a' is denormal and we are in flush-to-zero mode then set the
3690 | input-denormal exception and return zero. Otherwise just return the value.
3691 *----------------------------------------------------------------------------*/
3692 
3693 static bool parts_squash_denormal(FloatParts p, float_status *status)
3694 {
3695     if (p.exp == 0 && p.frac != 0) {
3696         float_raise(float_flag_input_denormal, status);
3697         return true;
3698     }
3699 
3700     return false;
3701 }
3702 
3703 float16 float16_squash_input_denormal(float16 a, float_status *status)
3704 {
3705     if (status->flush_inputs_to_zero) {
3706         FloatParts p = float16_unpack_raw(a);
3707         if (parts_squash_denormal(p, status)) {
3708             return float16_set_sign(float16_zero, p.sign);
3709         }
3710     }
3711     return a;
3712 }
3713 
3714 float32 float32_squash_input_denormal(float32 a, float_status *status)
3715 {
3716     if (status->flush_inputs_to_zero) {
3717         FloatParts p = float32_unpack_raw(a);
3718         if (parts_squash_denormal(p, status)) {
3719             return float32_set_sign(float32_zero, p.sign);
3720         }
3721     }
3722     return a;
3723 }
3724 
3725 float64 float64_squash_input_denormal(float64 a, float_status *status)
3726 {
3727     if (status->flush_inputs_to_zero) {
3728         FloatParts p = float64_unpack_raw(a);
3729         if (parts_squash_denormal(p, status)) {
3730             return float64_set_sign(float64_zero, p.sign);
3731         }
3732     }
3733     return a;
3734 }
3735 
3736 bfloat16 bfloat16_squash_input_denormal(bfloat16 a, float_status *status)
3737 {
3738     if (status->flush_inputs_to_zero) {
3739         FloatParts p = bfloat16_unpack_raw(a);
3740         if (parts_squash_denormal(p, status)) {
3741             return bfloat16_set_sign(bfloat16_zero, p.sign);
3742         }
3743     }
3744     return a;
3745 }
3746 
3747 /*----------------------------------------------------------------------------
3748 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
3749 | and 7, and returns the properly rounded 32-bit integer corresponding to the
3750 | input.  If `zSign' is 1, the input is negated before being converted to an
3751 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
3752 | is simply rounded to an integer, with the inexact exception raised if the
3753 | input cannot be represented exactly as an integer.  However, if the fixed-
3754 | point input is too large, the invalid exception is raised and the largest
3755 | positive or negative integer is returned.
3756 *----------------------------------------------------------------------------*/
3757 
3758 static int32_t roundAndPackInt32(bool zSign, uint64_t absZ,
3759                                  float_status *status)
3760 {
3761     int8_t roundingMode;
3762     bool roundNearestEven;
3763     int8_t roundIncrement, roundBits;
3764     int32_t z;
3765 
3766     roundingMode = status->float_rounding_mode;
3767     roundNearestEven = ( roundingMode == float_round_nearest_even );
3768     switch (roundingMode) {
3769     case float_round_nearest_even:
3770     case float_round_ties_away:
3771         roundIncrement = 0x40;
3772         break;
3773     case float_round_to_zero:
3774         roundIncrement = 0;
3775         break;
3776     case float_round_up:
3777         roundIncrement = zSign ? 0 : 0x7f;
3778         break;
3779     case float_round_down:
3780         roundIncrement = zSign ? 0x7f : 0;
3781         break;
3782     case float_round_to_odd:
3783         roundIncrement = absZ & 0x80 ? 0 : 0x7f;
3784         break;
3785     default:
3786         abort();
3787     }
3788     roundBits = absZ & 0x7F;
3789     absZ = ( absZ + roundIncrement )>>7;
3790     if (!(roundBits ^ 0x40) && roundNearestEven) {
3791         absZ &= ~1;
3792     }
3793     z = absZ;
3794     if ( zSign ) z = - z;
3795     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
3796         float_raise(float_flag_invalid, status);
3797         return zSign ? INT32_MIN : INT32_MAX;
3798     }
3799     if (roundBits) {
3800         status->float_exception_flags |= float_flag_inexact;
3801     }
3802     return z;
3803 
3804 }
3805 
3806 /*----------------------------------------------------------------------------
3807 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3808 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3809 | and returns the properly rounded 64-bit integer corresponding to the input.
3810 | If `zSign' is 1, the input is negated before being converted to an integer.
3811 | Ordinarily, the fixed-point input is simply rounded to an integer, with
3812 | the inexact exception raised if the input cannot be represented exactly as
3813 | an integer.  However, if the fixed-point input is too large, the invalid
3814 | exception is raised and the largest positive or negative integer is
3815 | returned.
3816 *----------------------------------------------------------------------------*/
3817 
3818 static int64_t roundAndPackInt64(bool zSign, uint64_t absZ0, uint64_t absZ1,
3819                                float_status *status)
3820 {
3821     int8_t roundingMode;
3822     bool roundNearestEven, increment;
3823     int64_t z;
3824 
3825     roundingMode = status->float_rounding_mode;
3826     roundNearestEven = ( roundingMode == float_round_nearest_even );
3827     switch (roundingMode) {
3828     case float_round_nearest_even:
3829     case float_round_ties_away:
3830         increment = ((int64_t) absZ1 < 0);
3831         break;
3832     case float_round_to_zero:
3833         increment = 0;
3834         break;
3835     case float_round_up:
3836         increment = !zSign && absZ1;
3837         break;
3838     case float_round_down:
3839         increment = zSign && absZ1;
3840         break;
3841     case float_round_to_odd:
3842         increment = !(absZ0 & 1) && absZ1;
3843         break;
3844     default:
3845         abort();
3846     }
3847     if ( increment ) {
3848         ++absZ0;
3849         if ( absZ0 == 0 ) goto overflow;
3850         if (!(absZ1 << 1) && roundNearestEven) {
3851             absZ0 &= ~1;
3852         }
3853     }
3854     z = absZ0;
3855     if ( zSign ) z = - z;
3856     if ( z && ( ( z < 0 ) ^ zSign ) ) {
3857  overflow:
3858         float_raise(float_flag_invalid, status);
3859         return zSign ? INT64_MIN : INT64_MAX;
3860     }
3861     if (absZ1) {
3862         status->float_exception_flags |= float_flag_inexact;
3863     }
3864     return z;
3865 
3866 }
3867 
3868 /*----------------------------------------------------------------------------
3869 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3870 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3871 | and returns the properly rounded 64-bit unsigned integer corresponding to the
3872 | input.  Ordinarily, the fixed-point input is simply rounded to an integer,
3873 | with the inexact exception raised if the input cannot be represented exactly
3874 | as an integer.  However, if the fixed-point input is too large, the invalid
3875 | exception is raised and the largest unsigned integer is returned.
3876 *----------------------------------------------------------------------------*/
3877 
3878 static int64_t roundAndPackUint64(bool zSign, uint64_t absZ0,
3879                                 uint64_t absZ1, float_status *status)
3880 {
3881     int8_t roundingMode;
3882     bool roundNearestEven, increment;
3883 
3884     roundingMode = status->float_rounding_mode;
3885     roundNearestEven = (roundingMode == float_round_nearest_even);
3886     switch (roundingMode) {
3887     case float_round_nearest_even:
3888     case float_round_ties_away:
3889         increment = ((int64_t)absZ1 < 0);
3890         break;
3891     case float_round_to_zero:
3892         increment = 0;
3893         break;
3894     case float_round_up:
3895         increment = !zSign && absZ1;
3896         break;
3897     case float_round_down:
3898         increment = zSign && absZ1;
3899         break;
3900     case float_round_to_odd:
3901         increment = !(absZ0 & 1) && absZ1;
3902         break;
3903     default:
3904         abort();
3905     }
3906     if (increment) {
3907         ++absZ0;
3908         if (absZ0 == 0) {
3909             float_raise(float_flag_invalid, status);
3910             return UINT64_MAX;
3911         }
3912         if (!(absZ1 << 1) && roundNearestEven) {
3913             absZ0 &= ~1;
3914         }
3915     }
3916 
3917     if (zSign && absZ0) {
3918         float_raise(float_flag_invalid, status);
3919         return 0;
3920     }
3921 
3922     if (absZ1) {
3923         status->float_exception_flags |= float_flag_inexact;
3924     }
3925     return absZ0;
3926 }
3927 
3928 /*----------------------------------------------------------------------------
3929 | Normalizes the subnormal single-precision floating-point value represented
3930 | by the denormalized significand `aSig'.  The normalized exponent and
3931 | significand are stored at the locations pointed to by `zExpPtr' and
3932 | `zSigPtr', respectively.
3933 *----------------------------------------------------------------------------*/
3934 
3935 static void
3936  normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
3937 {
3938     int8_t shiftCount;
3939 
3940     shiftCount = clz32(aSig) - 8;
3941     *zSigPtr = aSig<<shiftCount;
3942     *zExpPtr = 1 - shiftCount;
3943 
3944 }
3945 
3946 /*----------------------------------------------------------------------------
3947 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3948 | and significand `zSig', and returns the proper single-precision floating-
3949 | point value corresponding to the abstract input.  Ordinarily, the abstract
3950 | value is simply rounded and packed into the single-precision format, with
3951 | the inexact exception raised if the abstract input cannot be represented
3952 | exactly.  However, if the abstract value is too large, the overflow and
3953 | inexact exceptions are raised and an infinity or maximal finite value is
3954 | returned.  If the abstract value is too small, the input value is rounded to
3955 | a subnormal number, and the underflow and inexact exceptions are raised if
3956 | the abstract input cannot be represented exactly as a subnormal single-
3957 | precision floating-point number.
3958 |     The input significand `zSig' has its binary point between bits 30
3959 | and 29, which is 7 bits to the left of the usual location.  This shifted
3960 | significand must be normalized or smaller.  If `zSig' is not normalized,
3961 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3962 | and it must not require rounding.  In the usual case that `zSig' is
3963 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3964 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3965 | Binary Floating-Point Arithmetic.
3966 *----------------------------------------------------------------------------*/
3967 
3968 static float32 roundAndPackFloat32(bool zSign, int zExp, uint32_t zSig,
3969                                    float_status *status)
3970 {
3971     int8_t roundingMode;
3972     bool roundNearestEven;
3973     int8_t roundIncrement, roundBits;
3974     bool isTiny;
3975 
3976     roundingMode = status->float_rounding_mode;
3977     roundNearestEven = ( roundingMode == float_round_nearest_even );
3978     switch (roundingMode) {
3979     case float_round_nearest_even:
3980     case float_round_ties_away:
3981         roundIncrement = 0x40;
3982         break;
3983     case float_round_to_zero:
3984         roundIncrement = 0;
3985         break;
3986     case float_round_up:
3987         roundIncrement = zSign ? 0 : 0x7f;
3988         break;
3989     case float_round_down:
3990         roundIncrement = zSign ? 0x7f : 0;
3991         break;
3992     case float_round_to_odd:
3993         roundIncrement = zSig & 0x80 ? 0 : 0x7f;
3994         break;
3995     default:
3996         abort();
3997         break;
3998     }
3999     roundBits = zSig & 0x7F;
4000     if ( 0xFD <= (uint16_t) zExp ) {
4001         if (    ( 0xFD < zExp )
4002              || (    ( zExp == 0xFD )
4003                   && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
4004            ) {
4005             bool overflow_to_inf = roundingMode != float_round_to_odd &&
4006                                    roundIncrement != 0;
4007             float_raise(float_flag_overflow | float_flag_inexact, status);
4008             return packFloat32(zSign, 0xFF, -!overflow_to_inf);
4009         }
4010         if ( zExp < 0 ) {
4011             if (status->flush_to_zero) {
4012                 float_raise(float_flag_output_denormal, status);
4013                 return packFloat32(zSign, 0, 0);
4014             }
4015             isTiny = status->tininess_before_rounding
4016                   || (zExp < -1)
4017                   || (zSig + roundIncrement < 0x80000000);
4018             shift32RightJamming( zSig, - zExp, &zSig );
4019             zExp = 0;
4020             roundBits = zSig & 0x7F;
4021             if (isTiny && roundBits) {
4022                 float_raise(float_flag_underflow, status);
4023             }
4024             if (roundingMode == float_round_to_odd) {
4025                 /*
4026                  * For round-to-odd case, the roundIncrement depends on
4027                  * zSig which just changed.
4028                  */
4029                 roundIncrement = zSig & 0x80 ? 0 : 0x7f;
4030             }
4031         }
4032     }
4033     if (roundBits) {
4034         status->float_exception_flags |= float_flag_inexact;
4035     }
4036     zSig = ( zSig + roundIncrement )>>7;
4037     if (!(roundBits ^ 0x40) && roundNearestEven) {
4038         zSig &= ~1;
4039     }
4040     if ( zSig == 0 ) zExp = 0;
4041     return packFloat32( zSign, zExp, zSig );
4042 
4043 }
4044 
4045 /*----------------------------------------------------------------------------
4046 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4047 | and significand `zSig', and returns the proper single-precision floating-
4048 | point value corresponding to the abstract input.  This routine is just like
4049 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
4050 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
4051 | floating-point exponent.
4052 *----------------------------------------------------------------------------*/
4053 
4054 static float32
4055  normalizeRoundAndPackFloat32(bool zSign, int zExp, uint32_t zSig,
4056                               float_status *status)
4057 {
4058     int8_t shiftCount;
4059 
4060     shiftCount = clz32(zSig) - 1;
4061     return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
4062                                status);
4063 
4064 }
4065 
4066 /*----------------------------------------------------------------------------
4067 | Normalizes the subnormal double-precision floating-point value represented
4068 | by the denormalized significand `aSig'.  The normalized exponent and
4069 | significand are stored at the locations pointed to by `zExpPtr' and
4070 | `zSigPtr', respectively.
4071 *----------------------------------------------------------------------------*/
4072 
4073 static void
4074  normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
4075 {
4076     int8_t shiftCount;
4077 
4078     shiftCount = clz64(aSig) - 11;
4079     *zSigPtr = aSig<<shiftCount;
4080     *zExpPtr = 1 - shiftCount;
4081 
4082 }
4083 
4084 /*----------------------------------------------------------------------------
4085 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
4086 | double-precision floating-point value, returning the result.  After being
4087 | shifted into the proper positions, the three fields are simply added
4088 | together to form the result.  This means that any integer portion of `zSig'
4089 | will be added into the exponent.  Since a properly normalized significand
4090 | will have an integer portion equal to 1, the `zExp' input should be 1 less
4091 | than the desired result exponent whenever `zSig' is a complete, normalized
4092 | significand.
4093 *----------------------------------------------------------------------------*/
4094 
4095 static inline float64 packFloat64(bool zSign, int zExp, uint64_t zSig)
4096 {
4097 
4098     return make_float64(
4099         ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
4100 
4101 }
4102 
4103 /*----------------------------------------------------------------------------
4104 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4105 | and significand `zSig', and returns the proper double-precision floating-
4106 | point value corresponding to the abstract input.  Ordinarily, the abstract
4107 | value is simply rounded and packed into the double-precision format, with
4108 | the inexact exception raised if the abstract input cannot be represented
4109 | exactly.  However, if the abstract value is too large, the overflow and
4110 | inexact exceptions are raised and an infinity or maximal finite value is
4111 | returned.  If the abstract value is too small, the input value is rounded to
4112 | a subnormal number, and the underflow and inexact exceptions are raised if
4113 | the abstract input cannot be represented exactly as a subnormal double-
4114 | precision floating-point number.
4115 |     The input significand `zSig' has its binary point between bits 62
4116 | and 61, which is 10 bits to the left of the usual location.  This shifted
4117 | significand must be normalized or smaller.  If `zSig' is not normalized,
4118 | `zExp' must be 0; in that case, the result returned is a subnormal number,
4119 | and it must not require rounding.  In the usual case that `zSig' is
4120 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
4121 | The handling of underflow and overflow follows the IEC/IEEE Standard for
4122 | Binary Floating-Point Arithmetic.
4123 *----------------------------------------------------------------------------*/
4124 
4125 static float64 roundAndPackFloat64(bool zSign, int zExp, uint64_t zSig,
4126                                    float_status *status)
4127 {
4128     int8_t roundingMode;
4129     bool roundNearestEven;
4130     int roundIncrement, roundBits;
4131     bool isTiny;
4132 
4133     roundingMode = status->float_rounding_mode;
4134     roundNearestEven = ( roundingMode == float_round_nearest_even );
4135     switch (roundingMode) {
4136     case float_round_nearest_even:
4137     case float_round_ties_away:
4138         roundIncrement = 0x200;
4139         break;
4140     case float_round_to_zero:
4141         roundIncrement = 0;
4142         break;
4143     case float_round_up:
4144         roundIncrement = zSign ? 0 : 0x3ff;
4145         break;
4146     case float_round_down:
4147         roundIncrement = zSign ? 0x3ff : 0;
4148         break;
4149     case float_round_to_odd:
4150         roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
4151         break;
4152     default:
4153         abort();
4154     }
4155     roundBits = zSig & 0x3FF;
4156     if ( 0x7FD <= (uint16_t) zExp ) {
4157         if (    ( 0x7FD < zExp )
4158              || (    ( zExp == 0x7FD )
4159                   && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
4160            ) {
4161             bool overflow_to_inf = roundingMode != float_round_to_odd &&
4162                                    roundIncrement != 0;
4163             float_raise(float_flag_overflow | float_flag_inexact, status);
4164             return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
4165         }
4166         if ( zExp < 0 ) {
4167             if (status->flush_to_zero) {
4168                 float_raise(float_flag_output_denormal, status);
4169                 return packFloat64(zSign, 0, 0);
4170             }
4171             isTiny = status->tininess_before_rounding
4172                   || (zExp < -1)
4173                   || (zSig + roundIncrement < UINT64_C(0x8000000000000000));
4174             shift64RightJamming( zSig, - zExp, &zSig );
4175             zExp = 0;
4176             roundBits = zSig & 0x3FF;
4177             if (isTiny && roundBits) {
4178                 float_raise(float_flag_underflow, status);
4179             }
4180             if (roundingMode == float_round_to_odd) {
4181                 /*
4182                  * For round-to-odd case, the roundIncrement depends on
4183                  * zSig which just changed.
4184                  */
4185                 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
4186             }
4187         }
4188     }
4189     if (roundBits) {
4190         status->float_exception_flags |= float_flag_inexact;
4191     }
4192     zSig = ( zSig + roundIncrement )>>10;
4193     if (!(roundBits ^ 0x200) && roundNearestEven) {
4194         zSig &= ~1;
4195     }
4196     if ( zSig == 0 ) zExp = 0;
4197     return packFloat64( zSign, zExp, zSig );
4198 
4199 }
4200 
4201 /*----------------------------------------------------------------------------
4202 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4203 | and significand `zSig', and returns the proper double-precision floating-
4204 | point value corresponding to the abstract input.  This routine is just like
4205 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
4206 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
4207 | floating-point exponent.
4208 *----------------------------------------------------------------------------*/
4209 
4210 static float64
4211  normalizeRoundAndPackFloat64(bool zSign, int zExp, uint64_t zSig,
4212                               float_status *status)
4213 {
4214     int8_t shiftCount;
4215 
4216     shiftCount = clz64(zSig) - 1;
4217     return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
4218                                status);
4219 
4220 }
4221 
4222 /*----------------------------------------------------------------------------
4223 | Normalizes the subnormal extended double-precision floating-point value
4224 | represented by the denormalized significand `aSig'.  The normalized exponent
4225 | and significand are stored at the locations pointed to by `zExpPtr' and
4226 | `zSigPtr', respectively.
4227 *----------------------------------------------------------------------------*/
4228 
4229 void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
4230                                 uint64_t *zSigPtr)
4231 {
4232     int8_t shiftCount;
4233 
4234     shiftCount = clz64(aSig);
4235     *zSigPtr = aSig<<shiftCount;
4236     *zExpPtr = 1 - shiftCount;
4237 }
4238 
4239 /*----------------------------------------------------------------------------
4240 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4241 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
4242 | and returns the proper extended double-precision floating-point value
4243 | corresponding to the abstract input.  Ordinarily, the abstract value is
4244 | rounded and packed into the extended double-precision format, with the
4245 | inexact exception raised if the abstract input cannot be represented
4246 | exactly.  However, if the abstract value is too large, the overflow and
4247 | inexact exceptions are raised and an infinity or maximal finite value is
4248 | returned.  If the abstract value is too small, the input value is rounded to
4249 | a subnormal number, and the underflow and inexact exceptions are raised if
4250 | the abstract input cannot be represented exactly as a subnormal extended
4251 | double-precision floating-point number.
4252 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
4253 | number of bits as single or double precision, respectively.  Otherwise, the
4254 | result is rounded to the full precision of the extended double-precision
4255 | format.
4256 |     The input significand must be normalized or smaller.  If the input
4257 | significand is not normalized, `zExp' must be 0; in that case, the result
4258 | returned is a subnormal number, and it must not require rounding.  The
4259 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
4260 | Floating-Point Arithmetic.
4261 *----------------------------------------------------------------------------*/
4262 
4263 floatx80 roundAndPackFloatx80(int8_t roundingPrecision, bool zSign,
4264                               int32_t zExp, uint64_t zSig0, uint64_t zSig1,
4265                               float_status *status)
4266 {
4267     int8_t roundingMode;
4268     bool roundNearestEven, increment, isTiny;
4269     int64_t roundIncrement, roundMask, roundBits;
4270 
4271     roundingMode = status->float_rounding_mode;
4272     roundNearestEven = ( roundingMode == float_round_nearest_even );
4273     if ( roundingPrecision == 80 ) goto precision80;
4274     if ( roundingPrecision == 64 ) {
4275         roundIncrement = UINT64_C(0x0000000000000400);
4276         roundMask = UINT64_C(0x00000000000007FF);
4277     }
4278     else if ( roundingPrecision == 32 ) {
4279         roundIncrement = UINT64_C(0x0000008000000000);
4280         roundMask = UINT64_C(0x000000FFFFFFFFFF);
4281     }
4282     else {
4283         goto precision80;
4284     }
4285     zSig0 |= ( zSig1 != 0 );
4286     switch (roundingMode) {
4287     case float_round_nearest_even:
4288     case float_round_ties_away:
4289         break;
4290     case float_round_to_zero:
4291         roundIncrement = 0;
4292         break;
4293     case float_round_up:
4294         roundIncrement = zSign ? 0 : roundMask;
4295         break;
4296     case float_round_down:
4297         roundIncrement = zSign ? roundMask : 0;
4298         break;
4299     default:
4300         abort();
4301     }
4302     roundBits = zSig0 & roundMask;
4303     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
4304         if (    ( 0x7FFE < zExp )
4305              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
4306            ) {
4307             goto overflow;
4308         }
4309         if ( zExp <= 0 ) {
4310             if (status->flush_to_zero) {
4311                 float_raise(float_flag_output_denormal, status);
4312                 return packFloatx80(zSign, 0, 0);
4313             }
4314             isTiny = status->tininess_before_rounding
4315                   || (zExp < 0 )
4316                   || (zSig0 <= zSig0 + roundIncrement);
4317             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
4318             zExp = 0;
4319             roundBits = zSig0 & roundMask;
4320             if (isTiny && roundBits) {
4321                 float_raise(float_flag_underflow, status);
4322             }
4323             if (roundBits) {
4324                 status->float_exception_flags |= float_flag_inexact;
4325             }
4326             zSig0 += roundIncrement;
4327             if ( (int64_t) zSig0 < 0 ) zExp = 1;
4328             roundIncrement = roundMask + 1;
4329             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
4330                 roundMask |= roundIncrement;
4331             }
4332             zSig0 &= ~ roundMask;
4333             return packFloatx80( zSign, zExp, zSig0 );
4334         }
4335     }
4336     if (roundBits) {
4337         status->float_exception_flags |= float_flag_inexact;
4338     }
4339     zSig0 += roundIncrement;
4340     if ( zSig0 < roundIncrement ) {
4341         ++zExp;
4342         zSig0 = UINT64_C(0x8000000000000000);
4343     }
4344     roundIncrement = roundMask + 1;
4345     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
4346         roundMask |= roundIncrement;
4347     }
4348     zSig0 &= ~ roundMask;
4349     if ( zSig0 == 0 ) zExp = 0;
4350     return packFloatx80( zSign, zExp, zSig0 );
4351  precision80:
4352     switch (roundingMode) {
4353     case float_round_nearest_even:
4354     case float_round_ties_away:
4355         increment = ((int64_t)zSig1 < 0);
4356         break;
4357     case float_round_to_zero:
4358         increment = 0;
4359         break;
4360     case float_round_up:
4361         increment = !zSign && zSig1;
4362         break;
4363     case float_round_down:
4364         increment = zSign && zSig1;
4365         break;
4366     default:
4367         abort();
4368     }
4369     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
4370         if (    ( 0x7FFE < zExp )
4371              || (    ( zExp == 0x7FFE )
4372                   && ( zSig0 == UINT64_C(0xFFFFFFFFFFFFFFFF) )
4373                   && increment
4374                 )
4375            ) {
4376             roundMask = 0;
4377  overflow:
4378             float_raise(float_flag_overflow | float_flag_inexact, status);
4379             if (    ( roundingMode == float_round_to_zero )
4380                  || ( zSign && ( roundingMode == float_round_up ) )
4381                  || ( ! zSign && ( roundingMode == float_round_down ) )
4382                ) {
4383                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
4384             }
4385             return packFloatx80(zSign,
4386                                 floatx80_infinity_high,
4387                                 floatx80_infinity_low);
4388         }
4389         if ( zExp <= 0 ) {
4390             isTiny = status->tininess_before_rounding
4391                   || (zExp < 0)
4392                   || !increment
4393                   || (zSig0 < UINT64_C(0xFFFFFFFFFFFFFFFF));
4394             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
4395             zExp = 0;
4396             if (isTiny && zSig1) {
4397                 float_raise(float_flag_underflow, status);
4398             }
4399             if (zSig1) {
4400                 status->float_exception_flags |= float_flag_inexact;
4401             }
4402             switch (roundingMode) {
4403             case float_round_nearest_even:
4404             case float_round_ties_away:
4405                 increment = ((int64_t)zSig1 < 0);
4406                 break;
4407             case float_round_to_zero:
4408                 increment = 0;
4409                 break;
4410             case float_round_up:
4411                 increment = !zSign && zSig1;
4412                 break;
4413             case float_round_down:
4414                 increment = zSign && zSig1;
4415                 break;
4416             default:
4417                 abort();
4418             }
4419             if ( increment ) {
4420                 ++zSig0;
4421                 if (!(zSig1 << 1) && roundNearestEven) {
4422                     zSig0 &= ~1;
4423                 }
4424                 if ( (int64_t) zSig0 < 0 ) zExp = 1;
4425             }
4426             return packFloatx80( zSign, zExp, zSig0 );
4427         }
4428     }
4429     if (zSig1) {
4430         status->float_exception_flags |= float_flag_inexact;
4431     }
4432     if ( increment ) {
4433         ++zSig0;
4434         if ( zSig0 == 0 ) {
4435             ++zExp;
4436             zSig0 = UINT64_C(0x8000000000000000);
4437         }
4438         else {
4439             if (!(zSig1 << 1) && roundNearestEven) {
4440                 zSig0 &= ~1;
4441             }
4442         }
4443     }
4444     else {
4445         if ( zSig0 == 0 ) zExp = 0;
4446     }
4447     return packFloatx80( zSign, zExp, zSig0 );
4448 
4449 }
4450 
4451 /*----------------------------------------------------------------------------
4452 | Takes an abstract floating-point value having sign `zSign', exponent
4453 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
4454 | and returns the proper extended double-precision floating-point value
4455 | corresponding to the abstract input.  This routine is just like
4456 | `roundAndPackFloatx80' except that the input significand does not have to be
4457 | normalized.
4458 *----------------------------------------------------------------------------*/
4459 
4460 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
4461                                        bool zSign, int32_t zExp,
4462                                        uint64_t zSig0, uint64_t zSig1,
4463                                        float_status *status)
4464 {
4465     int8_t shiftCount;
4466 
4467     if ( zSig0 == 0 ) {
4468         zSig0 = zSig1;
4469         zSig1 = 0;
4470         zExp -= 64;
4471     }
4472     shiftCount = clz64(zSig0);
4473     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4474     zExp -= shiftCount;
4475     return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
4476                                 zSig0, zSig1, status);
4477 
4478 }
4479 
4480 /*----------------------------------------------------------------------------
4481 | Returns the least-significant 64 fraction bits of the quadruple-precision
4482 | floating-point value `a'.
4483 *----------------------------------------------------------------------------*/
4484 
4485 static inline uint64_t extractFloat128Frac1( float128 a )
4486 {
4487 
4488     return a.low;
4489 
4490 }
4491 
4492 /*----------------------------------------------------------------------------
4493 | Returns the most-significant 48 fraction bits of the quadruple-precision
4494 | floating-point value `a'.
4495 *----------------------------------------------------------------------------*/
4496 
4497 static inline uint64_t extractFloat128Frac0( float128 a )
4498 {
4499 
4500     return a.high & UINT64_C(0x0000FFFFFFFFFFFF);
4501 
4502 }
4503 
4504 /*----------------------------------------------------------------------------
4505 | Returns the exponent bits of the quadruple-precision floating-point value
4506 | `a'.
4507 *----------------------------------------------------------------------------*/
4508 
4509 static inline int32_t extractFloat128Exp( float128 a )
4510 {
4511 
4512     return ( a.high>>48 ) & 0x7FFF;
4513 
4514 }
4515 
4516 /*----------------------------------------------------------------------------
4517 | Returns the sign bit of the quadruple-precision floating-point value `a'.
4518 *----------------------------------------------------------------------------*/
4519 
4520 static inline bool extractFloat128Sign(float128 a)
4521 {
4522     return a.high >> 63;
4523 }
4524 
4525 /*----------------------------------------------------------------------------
4526 | Normalizes the subnormal quadruple-precision floating-point value
4527 | represented by the denormalized significand formed by the concatenation of
4528 | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
4529 | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
4530 | significand are stored at the location pointed to by `zSig0Ptr', and the
4531 | least significant 64 bits of the normalized significand are stored at the
4532 | location pointed to by `zSig1Ptr'.
4533 *----------------------------------------------------------------------------*/
4534 
4535 static void
4536  normalizeFloat128Subnormal(
4537      uint64_t aSig0,
4538      uint64_t aSig1,
4539      int32_t *zExpPtr,
4540      uint64_t *zSig0Ptr,
4541      uint64_t *zSig1Ptr
4542  )
4543 {
4544     int8_t shiftCount;
4545 
4546     if ( aSig0 == 0 ) {
4547         shiftCount = clz64(aSig1) - 15;
4548         if ( shiftCount < 0 ) {
4549             *zSig0Ptr = aSig1>>( - shiftCount );
4550             *zSig1Ptr = aSig1<<( shiftCount & 63 );
4551         }
4552         else {
4553             *zSig0Ptr = aSig1<<shiftCount;
4554             *zSig1Ptr = 0;
4555         }
4556         *zExpPtr = - shiftCount - 63;
4557     }
4558     else {
4559         shiftCount = clz64(aSig0) - 15;
4560         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
4561         *zExpPtr = 1 - shiftCount;
4562     }
4563 
4564 }
4565 
4566 /*----------------------------------------------------------------------------
4567 | Packs the sign `zSign', the exponent `zExp', and the significand formed
4568 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
4569 | floating-point value, returning the result.  After being shifted into the
4570 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
4571 | added together to form the most significant 32 bits of the result.  This
4572 | means that any integer portion of `zSig0' will be added into the exponent.
4573 | Since a properly normalized significand will have an integer portion equal
4574 | to 1, the `zExp' input should be 1 less than the desired result exponent
4575 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
4576 | significand.
4577 *----------------------------------------------------------------------------*/
4578 
4579 static inline float128
4580 packFloat128(bool zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1)
4581 {
4582     float128 z;
4583 
4584     z.low = zSig1;
4585     z.high = ((uint64_t)zSign << 63) + ((uint64_t)zExp << 48) + zSig0;
4586     return z;
4587 }
4588 
4589 /*----------------------------------------------------------------------------
4590 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4591 | and extended significand formed by the concatenation of `zSig0', `zSig1',
4592 | and `zSig2', and returns the proper quadruple-precision floating-point value
4593 | corresponding to the abstract input.  Ordinarily, the abstract value is
4594 | simply rounded and packed into the quadruple-precision format, with the
4595 | inexact exception raised if the abstract input cannot be represented
4596 | exactly.  However, if the abstract value is too large, the overflow and
4597 | inexact exceptions are raised and an infinity or maximal finite value is
4598 | returned.  If the abstract value is too small, the input value is rounded to
4599 | a subnormal number, and the underflow and inexact exceptions are raised if
4600 | the abstract input cannot be represented exactly as a subnormal quadruple-
4601 | precision floating-point number.
4602 |     The input significand must be normalized or smaller.  If the input
4603 | significand is not normalized, `zExp' must be 0; in that case, the result
4604 | returned is a subnormal number, and it must not require rounding.  In the
4605 | usual case that the input significand is normalized, `zExp' must be 1 less
4606 | than the ``true'' floating-point exponent.  The handling of underflow and
4607 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4608 *----------------------------------------------------------------------------*/
4609 
4610 static float128 roundAndPackFloat128(bool zSign, int32_t zExp,
4611                                      uint64_t zSig0, uint64_t zSig1,
4612                                      uint64_t zSig2, float_status *status)
4613 {
4614     int8_t roundingMode;
4615     bool roundNearestEven, increment, isTiny;
4616 
4617     roundingMode = status->float_rounding_mode;
4618     roundNearestEven = ( roundingMode == float_round_nearest_even );
4619     switch (roundingMode) {
4620     case float_round_nearest_even:
4621     case float_round_ties_away:
4622         increment = ((int64_t)zSig2 < 0);
4623         break;
4624     case float_round_to_zero:
4625         increment = 0;
4626         break;
4627     case float_round_up:
4628         increment = !zSign && zSig2;
4629         break;
4630     case float_round_down:
4631         increment = zSign && zSig2;
4632         break;
4633     case float_round_to_odd:
4634         increment = !(zSig1 & 0x1) && zSig2;
4635         break;
4636     default:
4637         abort();
4638     }
4639     if ( 0x7FFD <= (uint32_t) zExp ) {
4640         if (    ( 0x7FFD < zExp )
4641              || (    ( zExp == 0x7FFD )
4642                   && eq128(
4643                          UINT64_C(0x0001FFFFFFFFFFFF),
4644                          UINT64_C(0xFFFFFFFFFFFFFFFF),
4645                          zSig0,
4646                          zSig1
4647                      )
4648                   && increment
4649                 )
4650            ) {
4651             float_raise(float_flag_overflow | float_flag_inexact, status);
4652             if (    ( roundingMode == float_round_to_zero )
4653                  || ( zSign && ( roundingMode == float_round_up ) )
4654                  || ( ! zSign && ( roundingMode == float_round_down ) )
4655                  || (roundingMode == float_round_to_odd)
4656                ) {
4657                 return
4658                     packFloat128(
4659                         zSign,
4660                         0x7FFE,
4661                         UINT64_C(0x0000FFFFFFFFFFFF),
4662                         UINT64_C(0xFFFFFFFFFFFFFFFF)
4663                     );
4664             }
4665             return packFloat128( zSign, 0x7FFF, 0, 0 );
4666         }
4667         if ( zExp < 0 ) {
4668             if (status->flush_to_zero) {
4669                 float_raise(float_flag_output_denormal, status);
4670                 return packFloat128(zSign, 0, 0, 0);
4671             }
4672             isTiny = status->tininess_before_rounding
4673                   || (zExp < -1)
4674                   || !increment
4675                   || lt128(zSig0, zSig1,
4676                            UINT64_C(0x0001FFFFFFFFFFFF),
4677                            UINT64_C(0xFFFFFFFFFFFFFFFF));
4678             shift128ExtraRightJamming(
4679                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
4680             zExp = 0;
4681             if (isTiny && zSig2) {
4682                 float_raise(float_flag_underflow, status);
4683             }
4684             switch (roundingMode) {
4685             case float_round_nearest_even:
4686             case float_round_ties_away:
4687                 increment = ((int64_t)zSig2 < 0);
4688                 break;
4689             case float_round_to_zero:
4690                 increment = 0;
4691                 break;
4692             case float_round_up:
4693                 increment = !zSign && zSig2;
4694                 break;
4695             case float_round_down:
4696                 increment = zSign && zSig2;
4697                 break;
4698             case float_round_to_odd:
4699                 increment = !(zSig1 & 0x1) && zSig2;
4700                 break;
4701             default:
4702                 abort();
4703             }
4704         }
4705     }
4706     if (zSig2) {
4707         status->float_exception_flags |= float_flag_inexact;
4708     }
4709     if ( increment ) {
4710         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
4711         if ((zSig2 + zSig2 == 0) && roundNearestEven) {
4712             zSig1 &= ~1;
4713         }
4714     }
4715     else {
4716         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
4717     }
4718     return packFloat128( zSign, zExp, zSig0, zSig1 );
4719 
4720 }
4721 
4722 /*----------------------------------------------------------------------------
4723 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4724 | and significand formed by the concatenation of `zSig0' and `zSig1', and
4725 | returns the proper quadruple-precision floating-point value corresponding
4726 | to the abstract input.  This routine is just like `roundAndPackFloat128'
4727 | except that the input significand has fewer bits and does not have to be
4728 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
4729 | point exponent.
4730 *----------------------------------------------------------------------------*/
4731 
4732 static float128 normalizeRoundAndPackFloat128(bool zSign, int32_t zExp,
4733                                               uint64_t zSig0, uint64_t zSig1,
4734                                               float_status *status)
4735 {
4736     int8_t shiftCount;
4737     uint64_t zSig2;
4738 
4739     if ( zSig0 == 0 ) {
4740         zSig0 = zSig1;
4741         zSig1 = 0;
4742         zExp -= 64;
4743     }
4744     shiftCount = clz64(zSig0) - 15;
4745     if ( 0 <= shiftCount ) {
4746         zSig2 = 0;
4747         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4748     }
4749     else {
4750         shift128ExtraRightJamming(
4751             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
4752     }
4753     zExp -= shiftCount;
4754     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
4755 
4756 }
4757 
4758 
4759 /*----------------------------------------------------------------------------
4760 | Returns the result of converting the 32-bit two's complement integer `a'
4761 | to the extended double-precision floating-point format.  The conversion
4762 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4763 | Arithmetic.
4764 *----------------------------------------------------------------------------*/
4765 
4766 floatx80 int32_to_floatx80(int32_t a, float_status *status)
4767 {
4768     bool zSign;
4769     uint32_t absA;
4770     int8_t shiftCount;
4771     uint64_t zSig;
4772 
4773     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
4774     zSign = ( a < 0 );
4775     absA = zSign ? - a : a;
4776     shiftCount = clz32(absA) + 32;
4777     zSig = absA;
4778     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
4779 
4780 }
4781 
4782 /*----------------------------------------------------------------------------
4783 | Returns the result of converting the 32-bit two's complement integer `a' to
4784 | the quadruple-precision floating-point format.  The conversion is performed
4785 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4786 *----------------------------------------------------------------------------*/
4787 
4788 float128 int32_to_float128(int32_t a, float_status *status)
4789 {
4790     bool zSign;
4791     uint32_t absA;
4792     int8_t shiftCount;
4793     uint64_t zSig0;
4794 
4795     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
4796     zSign = ( a < 0 );
4797     absA = zSign ? - a : a;
4798     shiftCount = clz32(absA) + 17;
4799     zSig0 = absA;
4800     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
4801 
4802 }
4803 
4804 /*----------------------------------------------------------------------------
4805 | Returns the result of converting the 64-bit two's complement integer `a'
4806 | to the extended double-precision floating-point format.  The conversion
4807 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4808 | Arithmetic.
4809 *----------------------------------------------------------------------------*/
4810 
4811 floatx80 int64_to_floatx80(int64_t a, float_status *status)
4812 {
4813     bool zSign;
4814     uint64_t absA;
4815     int8_t shiftCount;
4816 
4817     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
4818     zSign = ( a < 0 );
4819     absA = zSign ? - a : a;
4820     shiftCount = clz64(absA);
4821     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
4822 
4823 }
4824 
4825 /*----------------------------------------------------------------------------
4826 | Returns the result of converting the 64-bit two's complement integer `a' to
4827 | the quadruple-precision floating-point format.  The conversion is performed
4828 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4829 *----------------------------------------------------------------------------*/
4830 
4831 float128 int64_to_float128(int64_t a, float_status *status)
4832 {
4833     bool zSign;
4834     uint64_t absA;
4835     int8_t shiftCount;
4836     int32_t zExp;
4837     uint64_t zSig0, zSig1;
4838 
4839     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
4840     zSign = ( a < 0 );
4841     absA = zSign ? - a : a;
4842     shiftCount = clz64(absA) + 49;
4843     zExp = 0x406E - shiftCount;
4844     if ( 64 <= shiftCount ) {
4845         zSig1 = 0;
4846         zSig0 = absA;
4847         shiftCount -= 64;
4848     }
4849     else {
4850         zSig1 = absA;
4851         zSig0 = 0;
4852     }
4853     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4854     return packFloat128( zSign, zExp, zSig0, zSig1 );
4855 
4856 }
4857 
4858 /*----------------------------------------------------------------------------
4859 | Returns the result of converting the 64-bit unsigned integer `a'
4860 | to the quadruple-precision floating-point format.  The conversion is performed
4861 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4862 *----------------------------------------------------------------------------*/
4863 
4864 float128 uint64_to_float128(uint64_t a, float_status *status)
4865 {
4866     if (a == 0) {
4867         return float128_zero;
4868     }
4869     return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status);
4870 }
4871 
4872 /*----------------------------------------------------------------------------
4873 | Returns the result of converting the single-precision floating-point value
4874 | `a' to the extended double-precision floating-point format.  The conversion
4875 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4876 | Arithmetic.
4877 *----------------------------------------------------------------------------*/
4878 
4879 floatx80 float32_to_floatx80(float32 a, float_status *status)
4880 {
4881     bool aSign;
4882     int aExp;
4883     uint32_t aSig;
4884 
4885     a = float32_squash_input_denormal(a, status);
4886     aSig = extractFloat32Frac( a );
4887     aExp = extractFloat32Exp( a );
4888     aSign = extractFloat32Sign( a );
4889     if ( aExp == 0xFF ) {
4890         if (aSig) {
4891             floatx80 res = commonNaNToFloatx80(float32ToCommonNaN(a, status),
4892                                                status);
4893             return floatx80_silence_nan(res, status);
4894         }
4895         return packFloatx80(aSign,
4896                             floatx80_infinity_high,
4897                             floatx80_infinity_low);
4898     }
4899     if ( aExp == 0 ) {
4900         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
4901         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4902     }
4903     aSig |= 0x00800000;
4904     return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
4905 
4906 }
4907 
4908 /*----------------------------------------------------------------------------
4909 | Returns the result of converting the single-precision floating-point value
4910 | `a' to the double-precision floating-point format.  The conversion is
4911 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4912 | Arithmetic.
4913 *----------------------------------------------------------------------------*/
4914 
4915 float128 float32_to_float128(float32 a, float_status *status)
4916 {
4917     bool aSign;
4918     int aExp;
4919     uint32_t aSig;
4920 
4921     a = float32_squash_input_denormal(a, status);
4922     aSig = extractFloat32Frac( a );
4923     aExp = extractFloat32Exp( a );
4924     aSign = extractFloat32Sign( a );
4925     if ( aExp == 0xFF ) {
4926         if (aSig) {
4927             return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
4928         }
4929         return packFloat128( aSign, 0x7FFF, 0, 0 );
4930     }
4931     if ( aExp == 0 ) {
4932         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
4933         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4934         --aExp;
4935     }
4936     return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
4937 
4938 }
4939 
4940 /*----------------------------------------------------------------------------
4941 | Returns the remainder of the single-precision floating-point value `a'
4942 | with respect to the corresponding value `b'.  The operation is performed
4943 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4944 *----------------------------------------------------------------------------*/
4945 
4946 float32 float32_rem(float32 a, float32 b, float_status *status)
4947 {
4948     bool aSign, zSign;
4949     int aExp, bExp, expDiff;
4950     uint32_t aSig, bSig;
4951     uint32_t q;
4952     uint64_t aSig64, bSig64, q64;
4953     uint32_t alternateASig;
4954     int32_t sigMean;
4955     a = float32_squash_input_denormal(a, status);
4956     b = float32_squash_input_denormal(b, status);
4957 
4958     aSig = extractFloat32Frac( a );
4959     aExp = extractFloat32Exp( a );
4960     aSign = extractFloat32Sign( a );
4961     bSig = extractFloat32Frac( b );
4962     bExp = extractFloat32Exp( b );
4963     if ( aExp == 0xFF ) {
4964         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
4965             return propagateFloat32NaN(a, b, status);
4966         }
4967         float_raise(float_flag_invalid, status);
4968         return float32_default_nan(status);
4969     }
4970     if ( bExp == 0xFF ) {
4971         if (bSig) {
4972             return propagateFloat32NaN(a, b, status);
4973         }
4974         return a;
4975     }
4976     if ( bExp == 0 ) {
4977         if ( bSig == 0 ) {
4978             float_raise(float_flag_invalid, status);
4979             return float32_default_nan(status);
4980         }
4981         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
4982     }
4983     if ( aExp == 0 ) {
4984         if ( aSig == 0 ) return a;
4985         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4986     }
4987     expDiff = aExp - bExp;
4988     aSig |= 0x00800000;
4989     bSig |= 0x00800000;
4990     if ( expDiff < 32 ) {
4991         aSig <<= 8;
4992         bSig <<= 8;
4993         if ( expDiff < 0 ) {
4994             if ( expDiff < -1 ) return a;
4995             aSig >>= 1;
4996         }
4997         q = ( bSig <= aSig );
4998         if ( q ) aSig -= bSig;
4999         if ( 0 < expDiff ) {
5000             q = ( ( (uint64_t) aSig )<<32 ) / bSig;
5001             q >>= 32 - expDiff;
5002             bSig >>= 2;
5003             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
5004         }
5005         else {
5006             aSig >>= 2;
5007             bSig >>= 2;
5008         }
5009     }
5010     else {
5011         if ( bSig <= aSig ) aSig -= bSig;
5012         aSig64 = ( (uint64_t) aSig )<<40;
5013         bSig64 = ( (uint64_t) bSig )<<40;
5014         expDiff -= 64;
5015         while ( 0 < expDiff ) {
5016             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
5017             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
5018             aSig64 = - ( ( bSig * q64 )<<38 );
5019             expDiff -= 62;
5020         }
5021         expDiff += 64;
5022         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
5023         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
5024         q = q64>>( 64 - expDiff );
5025         bSig <<= 6;
5026         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
5027     }
5028     do {
5029         alternateASig = aSig;
5030         ++q;
5031         aSig -= bSig;
5032     } while ( 0 <= (int32_t) aSig );
5033     sigMean = aSig + alternateASig;
5034     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
5035         aSig = alternateASig;
5036     }
5037     zSign = ( (int32_t) aSig < 0 );
5038     if ( zSign ) aSig = - aSig;
5039     return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
5040 }
5041 
5042 
5043 
5044 /*----------------------------------------------------------------------------
5045 | Returns the binary exponential of the single-precision floating-point value
5046 | `a'. The operation is performed according to the IEC/IEEE Standard for
5047 | Binary Floating-Point Arithmetic.
5048 |
5049 | Uses the following identities:
5050 |
5051 | 1. -------------------------------------------------------------------------
5052 |      x    x*ln(2)
5053 |     2  = e
5054 |
5055 | 2. -------------------------------------------------------------------------
5056 |                      2     3     4     5           n
5057 |      x        x     x     x     x     x           x
5058 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
5059 |               1!    2!    3!    4!    5!          n!
5060 *----------------------------------------------------------------------------*/
5061 
5062 static const float64 float32_exp2_coefficients[15] =
5063 {
5064     const_float64( 0x3ff0000000000000ll ), /*  1 */
5065     const_float64( 0x3fe0000000000000ll ), /*  2 */
5066     const_float64( 0x3fc5555555555555ll ), /*  3 */
5067     const_float64( 0x3fa5555555555555ll ), /*  4 */
5068     const_float64( 0x3f81111111111111ll ), /*  5 */
5069     const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
5070     const_float64( 0x3f2a01a01a01a01all ), /*  7 */
5071     const_float64( 0x3efa01a01a01a01all ), /*  8 */
5072     const_float64( 0x3ec71de3a556c734ll ), /*  9 */
5073     const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
5074     const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
5075     const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
5076     const_float64( 0x3de6124613a86d09ll ), /* 13 */
5077     const_float64( 0x3da93974a8c07c9dll ), /* 14 */
5078     const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
5079 };
5080 
5081 float32 float32_exp2(float32 a, float_status *status)
5082 {
5083     bool aSign;
5084     int aExp;
5085     uint32_t aSig;
5086     float64 r, x, xn;
5087     int i;
5088     a = float32_squash_input_denormal(a, status);
5089 
5090     aSig = extractFloat32Frac( a );
5091     aExp = extractFloat32Exp( a );
5092     aSign = extractFloat32Sign( a );
5093 
5094     if ( aExp == 0xFF) {
5095         if (aSig) {
5096             return propagateFloat32NaN(a, float32_zero, status);
5097         }
5098         return (aSign) ? float32_zero : a;
5099     }
5100     if (aExp == 0) {
5101         if (aSig == 0) return float32_one;
5102     }
5103 
5104     float_raise(float_flag_inexact, status);
5105 
5106     /* ******************************* */
5107     /* using float64 for approximation */
5108     /* ******************************* */
5109     x = float32_to_float64(a, status);
5110     x = float64_mul(x, float64_ln2, status);
5111 
5112     xn = x;
5113     r = float64_one;
5114     for (i = 0 ; i < 15 ; i++) {
5115         float64 f;
5116 
5117         f = float64_mul(xn, float32_exp2_coefficients[i], status);
5118         r = float64_add(r, f, status);
5119 
5120         xn = float64_mul(xn, x, status);
5121     }
5122 
5123     return float64_to_float32(r, status);
5124 }
5125 
5126 /*----------------------------------------------------------------------------
5127 | Returns the binary log of the single-precision floating-point value `a'.
5128 | The operation is performed according to the IEC/IEEE Standard for Binary
5129 | Floating-Point Arithmetic.
5130 *----------------------------------------------------------------------------*/
5131 float32 float32_log2(float32 a, float_status *status)
5132 {
5133     bool aSign, zSign;
5134     int aExp;
5135     uint32_t aSig, zSig, i;
5136 
5137     a = float32_squash_input_denormal(a, status);
5138     aSig = extractFloat32Frac( a );
5139     aExp = extractFloat32Exp( a );
5140     aSign = extractFloat32Sign( a );
5141 
5142     if ( aExp == 0 ) {
5143         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
5144         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
5145     }
5146     if ( aSign ) {
5147         float_raise(float_flag_invalid, status);
5148         return float32_default_nan(status);
5149     }
5150     if ( aExp == 0xFF ) {
5151         if (aSig) {
5152             return propagateFloat32NaN(a, float32_zero, status);
5153         }
5154         return a;
5155     }
5156 
5157     aExp -= 0x7F;
5158     aSig |= 0x00800000;
5159     zSign = aExp < 0;
5160     zSig = aExp << 23;
5161 
5162     for (i = 1 << 22; i > 0; i >>= 1) {
5163         aSig = ( (uint64_t)aSig * aSig ) >> 23;
5164         if ( aSig & 0x01000000 ) {
5165             aSig >>= 1;
5166             zSig |= i;
5167         }
5168     }
5169 
5170     if ( zSign )
5171         zSig = -zSig;
5172 
5173     return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
5174 }
5175 
5176 /*----------------------------------------------------------------------------
5177 | Returns the result of converting the double-precision floating-point value
5178 | `a' to the extended double-precision floating-point format.  The conversion
5179 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5180 | Arithmetic.
5181 *----------------------------------------------------------------------------*/
5182 
5183 floatx80 float64_to_floatx80(float64 a, float_status *status)
5184 {
5185     bool aSign;
5186     int aExp;
5187     uint64_t aSig;
5188 
5189     a = float64_squash_input_denormal(a, status);
5190     aSig = extractFloat64Frac( a );
5191     aExp = extractFloat64Exp( a );
5192     aSign = extractFloat64Sign( a );
5193     if ( aExp == 0x7FF ) {
5194         if (aSig) {
5195             floatx80 res = commonNaNToFloatx80(float64ToCommonNaN(a, status),
5196                                                status);
5197             return floatx80_silence_nan(res, status);
5198         }
5199         return packFloatx80(aSign,
5200                             floatx80_infinity_high,
5201                             floatx80_infinity_low);
5202     }
5203     if ( aExp == 0 ) {
5204         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
5205         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5206     }
5207     return
5208         packFloatx80(
5209             aSign, aExp + 0x3C00, (aSig | UINT64_C(0x0010000000000000)) << 11);
5210 
5211 }
5212 
5213 /*----------------------------------------------------------------------------
5214 | Returns the result of converting the double-precision floating-point value
5215 | `a' to the quadruple-precision floating-point format.  The conversion is
5216 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5217 | Arithmetic.
5218 *----------------------------------------------------------------------------*/
5219 
5220 float128 float64_to_float128(float64 a, float_status *status)
5221 {
5222     bool aSign;
5223     int aExp;
5224     uint64_t aSig, zSig0, zSig1;
5225 
5226     a = float64_squash_input_denormal(a, status);
5227     aSig = extractFloat64Frac( a );
5228     aExp = extractFloat64Exp( a );
5229     aSign = extractFloat64Sign( a );
5230     if ( aExp == 0x7FF ) {
5231         if (aSig) {
5232             return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
5233         }
5234         return packFloat128( aSign, 0x7FFF, 0, 0 );
5235     }
5236     if ( aExp == 0 ) {
5237         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
5238         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5239         --aExp;
5240     }
5241     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
5242     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
5243 
5244 }
5245 
5246 
5247 /*----------------------------------------------------------------------------
5248 | Returns the remainder of the double-precision floating-point value `a'
5249 | with respect to the corresponding value `b'.  The operation is performed
5250 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5251 *----------------------------------------------------------------------------*/
5252 
5253 float64 float64_rem(float64 a, float64 b, float_status *status)
5254 {
5255     bool aSign, zSign;
5256     int aExp, bExp, expDiff;
5257     uint64_t aSig, bSig;
5258     uint64_t q, alternateASig;
5259     int64_t sigMean;
5260 
5261     a = float64_squash_input_denormal(a, status);
5262     b = float64_squash_input_denormal(b, status);
5263     aSig = extractFloat64Frac( a );
5264     aExp = extractFloat64Exp( a );
5265     aSign = extractFloat64Sign( a );
5266     bSig = extractFloat64Frac( b );
5267     bExp = extractFloat64Exp( b );
5268     if ( aExp == 0x7FF ) {
5269         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
5270             return propagateFloat64NaN(a, b, status);
5271         }
5272         float_raise(float_flag_invalid, status);
5273         return float64_default_nan(status);
5274     }
5275     if ( bExp == 0x7FF ) {
5276         if (bSig) {
5277             return propagateFloat64NaN(a, b, status);
5278         }
5279         return a;
5280     }
5281     if ( bExp == 0 ) {
5282         if ( bSig == 0 ) {
5283             float_raise(float_flag_invalid, status);
5284             return float64_default_nan(status);
5285         }
5286         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
5287     }
5288     if ( aExp == 0 ) {
5289         if ( aSig == 0 ) return a;
5290         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5291     }
5292     expDiff = aExp - bExp;
5293     aSig = (aSig | UINT64_C(0x0010000000000000)) << 11;
5294     bSig = (bSig | UINT64_C(0x0010000000000000)) << 11;
5295     if ( expDiff < 0 ) {
5296         if ( expDiff < -1 ) return a;
5297         aSig >>= 1;
5298     }
5299     q = ( bSig <= aSig );
5300     if ( q ) aSig -= bSig;
5301     expDiff -= 64;
5302     while ( 0 < expDiff ) {
5303         q = estimateDiv128To64( aSig, 0, bSig );
5304         q = ( 2 < q ) ? q - 2 : 0;
5305         aSig = - ( ( bSig>>2 ) * q );
5306         expDiff -= 62;
5307     }
5308     expDiff += 64;
5309     if ( 0 < expDiff ) {
5310         q = estimateDiv128To64( aSig, 0, bSig );
5311         q = ( 2 < q ) ? q - 2 : 0;
5312         q >>= 64 - expDiff;
5313         bSig >>= 2;
5314         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
5315     }
5316     else {
5317         aSig >>= 2;
5318         bSig >>= 2;
5319     }
5320     do {
5321         alternateASig = aSig;
5322         ++q;
5323         aSig -= bSig;
5324     } while ( 0 <= (int64_t) aSig );
5325     sigMean = aSig + alternateASig;
5326     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
5327         aSig = alternateASig;
5328     }
5329     zSign = ( (int64_t) aSig < 0 );
5330     if ( zSign ) aSig = - aSig;
5331     return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
5332 
5333 }
5334 
5335 /*----------------------------------------------------------------------------
5336 | Returns the binary log of the double-precision floating-point value `a'.
5337 | The operation is performed according to the IEC/IEEE Standard for Binary
5338 | Floating-Point Arithmetic.
5339 *----------------------------------------------------------------------------*/
5340 float64 float64_log2(float64 a, float_status *status)
5341 {
5342     bool aSign, zSign;
5343     int aExp;
5344     uint64_t aSig, aSig0, aSig1, zSig, i;
5345     a = float64_squash_input_denormal(a, status);
5346 
5347     aSig = extractFloat64Frac( a );
5348     aExp = extractFloat64Exp( a );
5349     aSign = extractFloat64Sign( a );
5350 
5351     if ( aExp == 0 ) {
5352         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
5353         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5354     }
5355     if ( aSign ) {
5356         float_raise(float_flag_invalid, status);
5357         return float64_default_nan(status);
5358     }
5359     if ( aExp == 0x7FF ) {
5360         if (aSig) {
5361             return propagateFloat64NaN(a, float64_zero, status);
5362         }
5363         return a;
5364     }
5365 
5366     aExp -= 0x3FF;
5367     aSig |= UINT64_C(0x0010000000000000);
5368     zSign = aExp < 0;
5369     zSig = (uint64_t)aExp << 52;
5370     for (i = 1LL << 51; i > 0; i >>= 1) {
5371         mul64To128( aSig, aSig, &aSig0, &aSig1 );
5372         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
5373         if ( aSig & UINT64_C(0x0020000000000000) ) {
5374             aSig >>= 1;
5375             zSig |= i;
5376         }
5377     }
5378 
5379     if ( zSign )
5380         zSig = -zSig;
5381     return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
5382 }
5383 
5384 /*----------------------------------------------------------------------------
5385 | Returns the result of converting the extended double-precision floating-
5386 | point value `a' to the 32-bit two's complement integer format.  The
5387 | conversion is performed according to the IEC/IEEE Standard for Binary
5388 | Floating-Point Arithmetic---which means in particular that the conversion
5389 | is rounded according to the current rounding mode.  If `a' is a NaN, the
5390 | largest positive integer is returned.  Otherwise, if the conversion
5391 | overflows, the largest integer with the same sign as `a' is returned.
5392 *----------------------------------------------------------------------------*/
5393 
5394 int32_t floatx80_to_int32(floatx80 a, float_status *status)
5395 {
5396     bool aSign;
5397     int32_t aExp, shiftCount;
5398     uint64_t aSig;
5399 
5400     if (floatx80_invalid_encoding(a)) {
5401         float_raise(float_flag_invalid, status);
5402         return 1 << 31;
5403     }
5404     aSig = extractFloatx80Frac( a );
5405     aExp = extractFloatx80Exp( a );
5406     aSign = extractFloatx80Sign( a );
5407     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
5408     shiftCount = 0x4037 - aExp;
5409     if ( shiftCount <= 0 ) shiftCount = 1;
5410     shift64RightJamming( aSig, shiftCount, &aSig );
5411     return roundAndPackInt32(aSign, aSig, status);
5412 
5413 }
5414 
5415 /*----------------------------------------------------------------------------
5416 | Returns the result of converting the extended double-precision floating-
5417 | point value `a' to the 32-bit two's complement integer format.  The
5418 | conversion is performed according to the IEC/IEEE Standard for Binary
5419 | Floating-Point Arithmetic, except that the conversion is always rounded
5420 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
5421 | Otherwise, if the conversion overflows, the largest integer with the same
5422 | sign as `a' is returned.
5423 *----------------------------------------------------------------------------*/
5424 
5425 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
5426 {
5427     bool aSign;
5428     int32_t aExp, shiftCount;
5429     uint64_t aSig, savedASig;
5430     int32_t z;
5431 
5432     if (floatx80_invalid_encoding(a)) {
5433         float_raise(float_flag_invalid, status);
5434         return 1 << 31;
5435     }
5436     aSig = extractFloatx80Frac( a );
5437     aExp = extractFloatx80Exp( a );
5438     aSign = extractFloatx80Sign( a );
5439     if ( 0x401E < aExp ) {
5440         if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
5441         goto invalid;
5442     }
5443     else if ( aExp < 0x3FFF ) {
5444         if (aExp || aSig) {
5445             status->float_exception_flags |= float_flag_inexact;
5446         }
5447         return 0;
5448     }
5449     shiftCount = 0x403E - aExp;
5450     savedASig = aSig;
5451     aSig >>= shiftCount;
5452     z = aSig;
5453     if ( aSign ) z = - z;
5454     if ( ( z < 0 ) ^ aSign ) {
5455  invalid:
5456         float_raise(float_flag_invalid, status);
5457         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5458     }
5459     if ( ( aSig<<shiftCount ) != savedASig ) {
5460         status->float_exception_flags |= float_flag_inexact;
5461     }
5462     return z;
5463 
5464 }
5465 
5466 /*----------------------------------------------------------------------------
5467 | Returns the result of converting the extended double-precision floating-
5468 | point value `a' to the 64-bit two's complement integer format.  The
5469 | conversion is performed according to the IEC/IEEE Standard for Binary
5470 | Floating-Point Arithmetic---which means in particular that the conversion
5471 | is rounded according to the current rounding mode.  If `a' is a NaN,
5472 | the largest positive integer is returned.  Otherwise, if the conversion
5473 | overflows, the largest integer with the same sign as `a' is returned.
5474 *----------------------------------------------------------------------------*/
5475 
5476 int64_t floatx80_to_int64(floatx80 a, float_status *status)
5477 {
5478     bool aSign;
5479     int32_t aExp, shiftCount;
5480     uint64_t aSig, aSigExtra;
5481 
5482     if (floatx80_invalid_encoding(a)) {
5483         float_raise(float_flag_invalid, status);
5484         return 1ULL << 63;
5485     }
5486     aSig = extractFloatx80Frac( a );
5487     aExp = extractFloatx80Exp( a );
5488     aSign = extractFloatx80Sign( a );
5489     shiftCount = 0x403E - aExp;
5490     if ( shiftCount <= 0 ) {
5491         if ( shiftCount ) {
5492             float_raise(float_flag_invalid, status);
5493             if (!aSign || floatx80_is_any_nan(a)) {
5494                 return INT64_MAX;
5495             }
5496             return INT64_MIN;
5497         }
5498         aSigExtra = 0;
5499     }
5500     else {
5501         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
5502     }
5503     return roundAndPackInt64(aSign, aSig, aSigExtra, status);
5504 
5505 }
5506 
5507 /*----------------------------------------------------------------------------
5508 | Returns the result of converting the extended double-precision floating-
5509 | point value `a' to the 64-bit two's complement integer format.  The
5510 | conversion is performed according to the IEC/IEEE Standard for Binary
5511 | Floating-Point Arithmetic, except that the conversion is always rounded
5512 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
5513 | Otherwise, if the conversion overflows, the largest integer with the same
5514 | sign as `a' is returned.
5515 *----------------------------------------------------------------------------*/
5516 
5517 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
5518 {
5519     bool aSign;
5520     int32_t aExp, shiftCount;
5521     uint64_t aSig;
5522     int64_t z;
5523 
5524     if (floatx80_invalid_encoding(a)) {
5525         float_raise(float_flag_invalid, status);
5526         return 1ULL << 63;
5527     }
5528     aSig = extractFloatx80Frac( a );
5529     aExp = extractFloatx80Exp( a );
5530     aSign = extractFloatx80Sign( a );
5531     shiftCount = aExp - 0x403E;
5532     if ( 0 <= shiftCount ) {
5533         aSig &= UINT64_C(0x7FFFFFFFFFFFFFFF);
5534         if ( ( a.high != 0xC03E ) || aSig ) {
5535             float_raise(float_flag_invalid, status);
5536             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
5537                 return INT64_MAX;
5538             }
5539         }
5540         return INT64_MIN;
5541     }
5542     else if ( aExp < 0x3FFF ) {
5543         if (aExp | aSig) {
5544             status->float_exception_flags |= float_flag_inexact;
5545         }
5546         return 0;
5547     }
5548     z = aSig>>( - shiftCount );
5549     if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
5550         status->float_exception_flags |= float_flag_inexact;
5551     }
5552     if ( aSign ) z = - z;
5553     return z;
5554 
5555 }
5556 
5557 /*----------------------------------------------------------------------------
5558 | Returns the result of converting the extended double-precision floating-
5559 | point value `a' to the single-precision floating-point format.  The
5560 | conversion is performed according to the IEC/IEEE Standard for Binary
5561 | Floating-Point Arithmetic.
5562 *----------------------------------------------------------------------------*/
5563 
5564 float32 floatx80_to_float32(floatx80 a, float_status *status)
5565 {
5566     bool aSign;
5567     int32_t aExp;
5568     uint64_t aSig;
5569 
5570     if (floatx80_invalid_encoding(a)) {
5571         float_raise(float_flag_invalid, status);
5572         return float32_default_nan(status);
5573     }
5574     aSig = extractFloatx80Frac( a );
5575     aExp = extractFloatx80Exp( a );
5576     aSign = extractFloatx80Sign( a );
5577     if ( aExp == 0x7FFF ) {
5578         if ( (uint64_t) ( aSig<<1 ) ) {
5579             float32 res = commonNaNToFloat32(floatx80ToCommonNaN(a, status),
5580                                              status);
5581             return float32_silence_nan(res, status);
5582         }
5583         return packFloat32( aSign, 0xFF, 0 );
5584     }
5585     shift64RightJamming( aSig, 33, &aSig );
5586     if ( aExp || aSig ) aExp -= 0x3F81;
5587     return roundAndPackFloat32(aSign, aExp, aSig, status);
5588 
5589 }
5590 
5591 /*----------------------------------------------------------------------------
5592 | Returns the result of converting the extended double-precision floating-
5593 | point value `a' to the double-precision floating-point format.  The
5594 | conversion is performed according to the IEC/IEEE Standard for Binary
5595 | Floating-Point Arithmetic.
5596 *----------------------------------------------------------------------------*/
5597 
5598 float64 floatx80_to_float64(floatx80 a, float_status *status)
5599 {
5600     bool aSign;
5601     int32_t aExp;
5602     uint64_t aSig, zSig;
5603 
5604     if (floatx80_invalid_encoding(a)) {
5605         float_raise(float_flag_invalid, status);
5606         return float64_default_nan(status);
5607     }
5608     aSig = extractFloatx80Frac( a );
5609     aExp = extractFloatx80Exp( a );
5610     aSign = extractFloatx80Sign( a );
5611     if ( aExp == 0x7FFF ) {
5612         if ( (uint64_t) ( aSig<<1 ) ) {
5613             float64 res = commonNaNToFloat64(floatx80ToCommonNaN(a, status),
5614                                              status);
5615             return float64_silence_nan(res, status);
5616         }
5617         return packFloat64( aSign, 0x7FF, 0 );
5618     }
5619     shift64RightJamming( aSig, 1, &zSig );
5620     if ( aExp || aSig ) aExp -= 0x3C01;
5621     return roundAndPackFloat64(aSign, aExp, zSig, status);
5622 
5623 }
5624 
5625 /*----------------------------------------------------------------------------
5626 | Returns the result of converting the extended double-precision floating-
5627 | point value `a' to the quadruple-precision floating-point format.  The
5628 | conversion is performed according to the IEC/IEEE Standard for Binary
5629 | Floating-Point Arithmetic.
5630 *----------------------------------------------------------------------------*/
5631 
5632 float128 floatx80_to_float128(floatx80 a, float_status *status)
5633 {
5634     bool aSign;
5635     int aExp;
5636     uint64_t aSig, zSig0, zSig1;
5637 
5638     if (floatx80_invalid_encoding(a)) {
5639         float_raise(float_flag_invalid, status);
5640         return float128_default_nan(status);
5641     }
5642     aSig = extractFloatx80Frac( a );
5643     aExp = extractFloatx80Exp( a );
5644     aSign = extractFloatx80Sign( a );
5645     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
5646         float128 res = commonNaNToFloat128(floatx80ToCommonNaN(a, status),
5647                                            status);
5648         return float128_silence_nan(res, status);
5649     }
5650     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
5651     return packFloat128( aSign, aExp, zSig0, zSig1 );
5652 
5653 }
5654 
5655 /*----------------------------------------------------------------------------
5656 | Rounds the extended double-precision floating-point value `a'
5657 | to the precision provided by floatx80_rounding_precision and returns the
5658 | result as an extended double-precision floating-point value.
5659 | The operation is performed according to the IEC/IEEE Standard for Binary
5660 | Floating-Point Arithmetic.
5661 *----------------------------------------------------------------------------*/
5662 
5663 floatx80 floatx80_round(floatx80 a, float_status *status)
5664 {
5665     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5666                                 extractFloatx80Sign(a),
5667                                 extractFloatx80Exp(a),
5668                                 extractFloatx80Frac(a), 0, status);
5669 }
5670 
5671 /*----------------------------------------------------------------------------
5672 | Rounds the extended double-precision floating-point value `a' to an integer,
5673 | and returns the result as an extended quadruple-precision floating-point
5674 | value.  The operation is performed according to the IEC/IEEE Standard for
5675 | Binary Floating-Point Arithmetic.
5676 *----------------------------------------------------------------------------*/
5677 
5678 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
5679 {
5680     bool aSign;
5681     int32_t aExp;
5682     uint64_t lastBitMask, roundBitsMask;
5683     floatx80 z;
5684 
5685     if (floatx80_invalid_encoding(a)) {
5686         float_raise(float_flag_invalid, status);
5687         return floatx80_default_nan(status);
5688     }
5689     aExp = extractFloatx80Exp( a );
5690     if ( 0x403E <= aExp ) {
5691         if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
5692             return propagateFloatx80NaN(a, a, status);
5693         }
5694         return a;
5695     }
5696     if ( aExp < 0x3FFF ) {
5697         if (    ( aExp == 0 )
5698              && ( (uint64_t) ( extractFloatx80Frac( a ) ) == 0 ) ) {
5699             return a;
5700         }
5701         status->float_exception_flags |= float_flag_inexact;
5702         aSign = extractFloatx80Sign( a );
5703         switch (status->float_rounding_mode) {
5704          case float_round_nearest_even:
5705             if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
5706                ) {
5707                 return
5708                     packFloatx80( aSign, 0x3FFF, UINT64_C(0x8000000000000000));
5709             }
5710             break;
5711         case float_round_ties_away:
5712             if (aExp == 0x3FFE) {
5713                 return packFloatx80(aSign, 0x3FFF, UINT64_C(0x8000000000000000));
5714             }
5715             break;
5716          case float_round_down:
5717             return
5718                   aSign ?
5719                       packFloatx80( 1, 0x3FFF, UINT64_C(0x8000000000000000))
5720                 : packFloatx80( 0, 0, 0 );
5721          case float_round_up:
5722             return
5723                   aSign ? packFloatx80( 1, 0, 0 )
5724                 : packFloatx80( 0, 0x3FFF, UINT64_C(0x8000000000000000));
5725 
5726         case float_round_to_zero:
5727             break;
5728         default:
5729             g_assert_not_reached();
5730         }
5731         return packFloatx80( aSign, 0, 0 );
5732     }
5733     lastBitMask = 1;
5734     lastBitMask <<= 0x403E - aExp;
5735     roundBitsMask = lastBitMask - 1;
5736     z = a;
5737     switch (status->float_rounding_mode) {
5738     case float_round_nearest_even:
5739         z.low += lastBitMask>>1;
5740         if ((z.low & roundBitsMask) == 0) {
5741             z.low &= ~lastBitMask;
5742         }
5743         break;
5744     case float_round_ties_away:
5745         z.low += lastBitMask >> 1;
5746         break;
5747     case float_round_to_zero:
5748         break;
5749     case float_round_up:
5750         if (!extractFloatx80Sign(z)) {
5751             z.low += roundBitsMask;
5752         }
5753         break;
5754     case float_round_down:
5755         if (extractFloatx80Sign(z)) {
5756             z.low += roundBitsMask;
5757         }
5758         break;
5759     default:
5760         abort();
5761     }
5762     z.low &= ~ roundBitsMask;
5763     if ( z.low == 0 ) {
5764         ++z.high;
5765         z.low = UINT64_C(0x8000000000000000);
5766     }
5767     if (z.low != a.low) {
5768         status->float_exception_flags |= float_flag_inexact;
5769     }
5770     return z;
5771 
5772 }
5773 
5774 /*----------------------------------------------------------------------------
5775 | Returns the result of adding the absolute values of the extended double-
5776 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
5777 | negated before being returned.  `zSign' is ignored if the result is a NaN.
5778 | The addition is performed according to the IEC/IEEE Standard for Binary
5779 | Floating-Point Arithmetic.
5780 *----------------------------------------------------------------------------*/
5781 
5782 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, bool zSign,
5783                                 float_status *status)
5784 {
5785     int32_t aExp, bExp, zExp;
5786     uint64_t aSig, bSig, zSig0, zSig1;
5787     int32_t expDiff;
5788 
5789     aSig = extractFloatx80Frac( a );
5790     aExp = extractFloatx80Exp( a );
5791     bSig = extractFloatx80Frac( b );
5792     bExp = extractFloatx80Exp( b );
5793     expDiff = aExp - bExp;
5794     if ( 0 < expDiff ) {
5795         if ( aExp == 0x7FFF ) {
5796             if ((uint64_t)(aSig << 1)) {
5797                 return propagateFloatx80NaN(a, b, status);
5798             }
5799             return a;
5800         }
5801         if ( bExp == 0 ) --expDiff;
5802         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5803         zExp = aExp;
5804     }
5805     else if ( expDiff < 0 ) {
5806         if ( bExp == 0x7FFF ) {
5807             if ((uint64_t)(bSig << 1)) {
5808                 return propagateFloatx80NaN(a, b, status);
5809             }
5810             return packFloatx80(zSign,
5811                                 floatx80_infinity_high,
5812                                 floatx80_infinity_low);
5813         }
5814         if ( aExp == 0 ) ++expDiff;
5815         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5816         zExp = bExp;
5817     }
5818     else {
5819         if ( aExp == 0x7FFF ) {
5820             if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5821                 return propagateFloatx80NaN(a, b, status);
5822             }
5823             return a;
5824         }
5825         zSig1 = 0;
5826         zSig0 = aSig + bSig;
5827         if ( aExp == 0 ) {
5828             if ((aSig | bSig) & UINT64_C(0x8000000000000000) && zSig0 < aSig) {
5829                 /* At least one of the values is a pseudo-denormal,
5830                  * and there is a carry out of the result.  */
5831                 zExp = 1;
5832                 goto shiftRight1;
5833             }
5834             if (zSig0 == 0) {
5835                 return packFloatx80(zSign, 0, 0);
5836             }
5837             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
5838             goto roundAndPack;
5839         }
5840         zExp = aExp;
5841         goto shiftRight1;
5842     }
5843     zSig0 = aSig + bSig;
5844     if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
5845  shiftRight1:
5846     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
5847     zSig0 |= UINT64_C(0x8000000000000000);
5848     ++zExp;
5849  roundAndPack:
5850     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5851                                 zSign, zExp, zSig0, zSig1, status);
5852 }
5853 
5854 /*----------------------------------------------------------------------------
5855 | Returns the result of subtracting the absolute values of the extended
5856 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
5857 | difference is negated before being returned.  `zSign' is ignored if the
5858 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
5859 | Standard for Binary Floating-Point Arithmetic.
5860 *----------------------------------------------------------------------------*/
5861 
5862 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, bool zSign,
5863                                 float_status *status)
5864 {
5865     int32_t aExp, bExp, zExp;
5866     uint64_t aSig, bSig, zSig0, zSig1;
5867     int32_t expDiff;
5868 
5869     aSig = extractFloatx80Frac( a );
5870     aExp = extractFloatx80Exp( a );
5871     bSig = extractFloatx80Frac( b );
5872     bExp = extractFloatx80Exp( b );
5873     expDiff = aExp - bExp;
5874     if ( 0 < expDiff ) goto aExpBigger;
5875     if ( expDiff < 0 ) goto bExpBigger;
5876     if ( aExp == 0x7FFF ) {
5877         if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5878             return propagateFloatx80NaN(a, b, status);
5879         }
5880         float_raise(float_flag_invalid, status);
5881         return floatx80_default_nan(status);
5882     }
5883     if ( aExp == 0 ) {
5884         aExp = 1;
5885         bExp = 1;
5886     }
5887     zSig1 = 0;
5888     if ( bSig < aSig ) goto aBigger;
5889     if ( aSig < bSig ) goto bBigger;
5890     return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
5891  bExpBigger:
5892     if ( bExp == 0x7FFF ) {
5893         if ((uint64_t)(bSig << 1)) {
5894             return propagateFloatx80NaN(a, b, status);
5895         }
5896         return packFloatx80(zSign ^ 1, floatx80_infinity_high,
5897                             floatx80_infinity_low);
5898     }
5899     if ( aExp == 0 ) ++expDiff;
5900     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5901  bBigger:
5902     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
5903     zExp = bExp;
5904     zSign ^= 1;
5905     goto normalizeRoundAndPack;
5906  aExpBigger:
5907     if ( aExp == 0x7FFF ) {
5908         if ((uint64_t)(aSig << 1)) {
5909             return propagateFloatx80NaN(a, b, status);
5910         }
5911         return a;
5912     }
5913     if ( bExp == 0 ) --expDiff;
5914     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5915  aBigger:
5916     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
5917     zExp = aExp;
5918  normalizeRoundAndPack:
5919     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
5920                                          zSign, zExp, zSig0, zSig1, status);
5921 }
5922 
5923 /*----------------------------------------------------------------------------
5924 | Returns the result of adding the extended double-precision floating-point
5925 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
5926 | Standard for Binary Floating-Point Arithmetic.
5927 *----------------------------------------------------------------------------*/
5928 
5929 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
5930 {
5931     bool aSign, bSign;
5932 
5933     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5934         float_raise(float_flag_invalid, status);
5935         return floatx80_default_nan(status);
5936     }
5937     aSign = extractFloatx80Sign( a );
5938     bSign = extractFloatx80Sign( b );
5939     if ( aSign == bSign ) {
5940         return addFloatx80Sigs(a, b, aSign, status);
5941     }
5942     else {
5943         return subFloatx80Sigs(a, b, aSign, status);
5944     }
5945 
5946 }
5947 
5948 /*----------------------------------------------------------------------------
5949 | Returns the result of subtracting the extended double-precision floating-
5950 | point values `a' and `b'.  The operation is performed according to the
5951 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5952 *----------------------------------------------------------------------------*/
5953 
5954 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
5955 {
5956     bool aSign, bSign;
5957 
5958     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5959         float_raise(float_flag_invalid, status);
5960         return floatx80_default_nan(status);
5961     }
5962     aSign = extractFloatx80Sign( a );
5963     bSign = extractFloatx80Sign( b );
5964     if ( aSign == bSign ) {
5965         return subFloatx80Sigs(a, b, aSign, status);
5966     }
5967     else {
5968         return addFloatx80Sigs(a, b, aSign, status);
5969     }
5970 
5971 }
5972 
5973 /*----------------------------------------------------------------------------
5974 | Returns the result of multiplying the extended double-precision floating-
5975 | point values `a' and `b'.  The operation is performed according to the
5976 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5977 *----------------------------------------------------------------------------*/
5978 
5979 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
5980 {
5981     bool aSign, bSign, zSign;
5982     int32_t aExp, bExp, zExp;
5983     uint64_t aSig, bSig, zSig0, zSig1;
5984 
5985     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5986         float_raise(float_flag_invalid, status);
5987         return floatx80_default_nan(status);
5988     }
5989     aSig = extractFloatx80Frac( a );
5990     aExp = extractFloatx80Exp( a );
5991     aSign = extractFloatx80Sign( a );
5992     bSig = extractFloatx80Frac( b );
5993     bExp = extractFloatx80Exp( b );
5994     bSign = extractFloatx80Sign( b );
5995     zSign = aSign ^ bSign;
5996     if ( aExp == 0x7FFF ) {
5997         if (    (uint64_t) ( aSig<<1 )
5998              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5999             return propagateFloatx80NaN(a, b, status);
6000         }
6001         if ( ( bExp | bSig ) == 0 ) goto invalid;
6002         return packFloatx80(zSign, floatx80_infinity_high,
6003                                    floatx80_infinity_low);
6004     }
6005     if ( bExp == 0x7FFF ) {
6006         if ((uint64_t)(bSig << 1)) {
6007             return propagateFloatx80NaN(a, b, status);
6008         }
6009         if ( ( aExp | aSig ) == 0 ) {
6010  invalid:
6011             float_raise(float_flag_invalid, status);
6012             return floatx80_default_nan(status);
6013         }
6014         return packFloatx80(zSign, floatx80_infinity_high,
6015                                    floatx80_infinity_low);
6016     }
6017     if ( aExp == 0 ) {
6018         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
6019         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
6020     }
6021     if ( bExp == 0 ) {
6022         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
6023         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
6024     }
6025     zExp = aExp + bExp - 0x3FFE;
6026     mul64To128( aSig, bSig, &zSig0, &zSig1 );
6027     if ( 0 < (int64_t) zSig0 ) {
6028         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
6029         --zExp;
6030     }
6031     return roundAndPackFloatx80(status->floatx80_rounding_precision,
6032                                 zSign, zExp, zSig0, zSig1, status);
6033 }
6034 
6035 /*----------------------------------------------------------------------------
6036 | Returns the result of dividing the extended double-precision floating-point
6037 | value `a' by the corresponding value `b'.  The operation is performed
6038 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6039 *----------------------------------------------------------------------------*/
6040 
6041 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
6042 {
6043     bool aSign, bSign, zSign;
6044     int32_t aExp, bExp, zExp;
6045     uint64_t aSig, bSig, zSig0, zSig1;
6046     uint64_t rem0, rem1, rem2, term0, term1, term2;
6047 
6048     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6049         float_raise(float_flag_invalid, status);
6050         return floatx80_default_nan(status);
6051     }
6052     aSig = extractFloatx80Frac( a );
6053     aExp = extractFloatx80Exp( a );
6054     aSign = extractFloatx80Sign( a );
6055     bSig = extractFloatx80Frac( b );
6056     bExp = extractFloatx80Exp( b );
6057     bSign = extractFloatx80Sign( b );
6058     zSign = aSign ^ bSign;
6059     if ( aExp == 0x7FFF ) {
6060         if ((uint64_t)(aSig << 1)) {
6061             return propagateFloatx80NaN(a, b, status);
6062         }
6063         if ( bExp == 0x7FFF ) {
6064             if ((uint64_t)(bSig << 1)) {
6065                 return propagateFloatx80NaN(a, b, status);
6066             }
6067             goto invalid;
6068         }
6069         return packFloatx80(zSign, floatx80_infinity_high,
6070                                    floatx80_infinity_low);
6071     }
6072     if ( bExp == 0x7FFF ) {
6073         if ((uint64_t)(bSig << 1)) {
6074             return propagateFloatx80NaN(a, b, status);
6075         }
6076         return packFloatx80( zSign, 0, 0 );
6077     }
6078     if ( bExp == 0 ) {
6079         if ( bSig == 0 ) {
6080             if ( ( aExp | aSig ) == 0 ) {
6081  invalid:
6082                 float_raise(float_flag_invalid, status);
6083                 return floatx80_default_nan(status);
6084             }
6085             float_raise(float_flag_divbyzero, status);
6086             return packFloatx80(zSign, floatx80_infinity_high,
6087                                        floatx80_infinity_low);
6088         }
6089         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
6090     }
6091     if ( aExp == 0 ) {
6092         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
6093         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
6094     }
6095     zExp = aExp - bExp + 0x3FFE;
6096     rem1 = 0;
6097     if ( bSig <= aSig ) {
6098         shift128Right( aSig, 0, 1, &aSig, &rem1 );
6099         ++zExp;
6100     }
6101     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
6102     mul64To128( bSig, zSig0, &term0, &term1 );
6103     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
6104     while ( (int64_t) rem0 < 0 ) {
6105         --zSig0;
6106         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
6107     }
6108     zSig1 = estimateDiv128To64( rem1, 0, bSig );
6109     if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
6110         mul64To128( bSig, zSig1, &term1, &term2 );
6111         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6112         while ( (int64_t) rem1 < 0 ) {
6113             --zSig1;
6114             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
6115         }
6116         zSig1 |= ( ( rem1 | rem2 ) != 0 );
6117     }
6118     return roundAndPackFloatx80(status->floatx80_rounding_precision,
6119                                 zSign, zExp, zSig0, zSig1, status);
6120 }
6121 
6122 /*----------------------------------------------------------------------------
6123 | Returns the remainder of the extended double-precision floating-point value
6124 | `a' with respect to the corresponding value `b'.  The operation is performed
6125 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic,
6126 | if 'mod' is false; if 'mod' is true, return the remainder based on truncating
6127 | the quotient toward zero instead.  '*quotient' is set to the low 64 bits of
6128 | the absolute value of the integer quotient.
6129 *----------------------------------------------------------------------------*/
6130 
6131 floatx80 floatx80_modrem(floatx80 a, floatx80 b, bool mod, uint64_t *quotient,
6132                          float_status *status)
6133 {
6134     bool aSign, zSign;
6135     int32_t aExp, bExp, expDiff, aExpOrig;
6136     uint64_t aSig0, aSig1, bSig;
6137     uint64_t q, term0, term1, alternateASig0, alternateASig1;
6138 
6139     *quotient = 0;
6140     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6141         float_raise(float_flag_invalid, status);
6142         return floatx80_default_nan(status);
6143     }
6144     aSig0 = extractFloatx80Frac( a );
6145     aExpOrig = aExp = extractFloatx80Exp( a );
6146     aSign = extractFloatx80Sign( a );
6147     bSig = extractFloatx80Frac( b );
6148     bExp = extractFloatx80Exp( b );
6149     if ( aExp == 0x7FFF ) {
6150         if (    (uint64_t) ( aSig0<<1 )
6151              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
6152             return propagateFloatx80NaN(a, b, status);
6153         }
6154         goto invalid;
6155     }
6156     if ( bExp == 0x7FFF ) {
6157         if ((uint64_t)(bSig << 1)) {
6158             return propagateFloatx80NaN(a, b, status);
6159         }
6160         if (aExp == 0 && aSig0 >> 63) {
6161             /*
6162              * Pseudo-denormal argument must be returned in normalized
6163              * form.
6164              */
6165             return packFloatx80(aSign, 1, aSig0);
6166         }
6167         return a;
6168     }
6169     if ( bExp == 0 ) {
6170         if ( bSig == 0 ) {
6171  invalid:
6172             float_raise(float_flag_invalid, status);
6173             return floatx80_default_nan(status);
6174         }
6175         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
6176     }
6177     if ( aExp == 0 ) {
6178         if ( aSig0 == 0 ) return a;
6179         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
6180     }
6181     zSign = aSign;
6182     expDiff = aExp - bExp;
6183     aSig1 = 0;
6184     if ( expDiff < 0 ) {
6185         if ( mod || expDiff < -1 ) {
6186             if (aExp == 1 && aExpOrig == 0) {
6187                 /*
6188                  * Pseudo-denormal argument must be returned in
6189                  * normalized form.
6190                  */
6191                 return packFloatx80(aSign, aExp, aSig0);
6192             }
6193             return a;
6194         }
6195         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
6196         expDiff = 0;
6197     }
6198     *quotient = q = ( bSig <= aSig0 );
6199     if ( q ) aSig0 -= bSig;
6200     expDiff -= 64;
6201     while ( 0 < expDiff ) {
6202         q = estimateDiv128To64( aSig0, aSig1, bSig );
6203         q = ( 2 < q ) ? q - 2 : 0;
6204         mul64To128( bSig, q, &term0, &term1 );
6205         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6206         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
6207         expDiff -= 62;
6208         *quotient <<= 62;
6209         *quotient += q;
6210     }
6211     expDiff += 64;
6212     if ( 0 < expDiff ) {
6213         q = estimateDiv128To64( aSig0, aSig1, bSig );
6214         q = ( 2 < q ) ? q - 2 : 0;
6215         q >>= 64 - expDiff;
6216         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
6217         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6218         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
6219         while ( le128( term0, term1, aSig0, aSig1 ) ) {
6220             ++q;
6221             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6222         }
6223         if (expDiff < 64) {
6224             *quotient <<= expDiff;
6225         } else {
6226             *quotient = 0;
6227         }
6228         *quotient += q;
6229     }
6230     else {
6231         term1 = 0;
6232         term0 = bSig;
6233     }
6234     if (!mod) {
6235         sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
6236         if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
6237                 || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
6238                         && ( q & 1 ) )
6239             ) {
6240             aSig0 = alternateASig0;
6241             aSig1 = alternateASig1;
6242             zSign = ! zSign;
6243             ++*quotient;
6244         }
6245     }
6246     return
6247         normalizeRoundAndPackFloatx80(
6248             80, zSign, bExp + expDiff, aSig0, aSig1, status);
6249 
6250 }
6251 
6252 /*----------------------------------------------------------------------------
6253 | Returns the remainder of the extended double-precision floating-point value
6254 | `a' with respect to the corresponding value `b'.  The operation is performed
6255 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6256 *----------------------------------------------------------------------------*/
6257 
6258 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
6259 {
6260     uint64_t quotient;
6261     return floatx80_modrem(a, b, false, &quotient, status);
6262 }
6263 
6264 /*----------------------------------------------------------------------------
6265 | Returns the remainder of the extended double-precision floating-point value
6266 | `a' with respect to the corresponding value `b', with the quotient truncated
6267 | toward zero.
6268 *----------------------------------------------------------------------------*/
6269 
6270 floatx80 floatx80_mod(floatx80 a, floatx80 b, float_status *status)
6271 {
6272     uint64_t quotient;
6273     return floatx80_modrem(a, b, true, &quotient, status);
6274 }
6275 
6276 /*----------------------------------------------------------------------------
6277 | Returns the square root of the extended double-precision floating-point
6278 | value `a'.  The operation is performed according to the IEC/IEEE Standard
6279 | for Binary Floating-Point Arithmetic.
6280 *----------------------------------------------------------------------------*/
6281 
6282 floatx80 floatx80_sqrt(floatx80 a, float_status *status)
6283 {
6284     bool aSign;
6285     int32_t aExp, zExp;
6286     uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
6287     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6288 
6289     if (floatx80_invalid_encoding(a)) {
6290         float_raise(float_flag_invalid, status);
6291         return floatx80_default_nan(status);
6292     }
6293     aSig0 = extractFloatx80Frac( a );
6294     aExp = extractFloatx80Exp( a );
6295     aSign = extractFloatx80Sign( a );
6296     if ( aExp == 0x7FFF ) {
6297         if ((uint64_t)(aSig0 << 1)) {
6298             return propagateFloatx80NaN(a, a, status);
6299         }
6300         if ( ! aSign ) return a;
6301         goto invalid;
6302     }
6303     if ( aSign ) {
6304         if ( ( aExp | aSig0 ) == 0 ) return a;
6305  invalid:
6306         float_raise(float_flag_invalid, status);
6307         return floatx80_default_nan(status);
6308     }
6309     if ( aExp == 0 ) {
6310         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
6311         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
6312     }
6313     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
6314     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
6315     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
6316     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6317     doubleZSig0 = zSig0<<1;
6318     mul64To128( zSig0, zSig0, &term0, &term1 );
6319     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6320     while ( (int64_t) rem0 < 0 ) {
6321         --zSig0;
6322         doubleZSig0 -= 2;
6323         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6324     }
6325     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6326     if ( ( zSig1 & UINT64_C(0x3FFFFFFFFFFFFFFF) ) <= 5 ) {
6327         if ( zSig1 == 0 ) zSig1 = 1;
6328         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6329         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6330         mul64To128( zSig1, zSig1, &term2, &term3 );
6331         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6332         while ( (int64_t) rem1 < 0 ) {
6333             --zSig1;
6334             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6335             term3 |= 1;
6336             term2 |= doubleZSig0;
6337             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6338         }
6339         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6340     }
6341     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
6342     zSig0 |= doubleZSig0;
6343     return roundAndPackFloatx80(status->floatx80_rounding_precision,
6344                                 0, zExp, zSig0, zSig1, status);
6345 }
6346 
6347 /*----------------------------------------------------------------------------
6348 | Returns the result of converting the quadruple-precision floating-point
6349 | value `a' to the 32-bit two's complement integer format.  The conversion
6350 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6351 | Arithmetic---which means in particular that the conversion is rounded
6352 | according to the current rounding mode.  If `a' is a NaN, the largest
6353 | positive integer is returned.  Otherwise, if the conversion overflows, the
6354 | largest integer with the same sign as `a' is returned.
6355 *----------------------------------------------------------------------------*/
6356 
6357 int32_t float128_to_int32(float128 a, float_status *status)
6358 {
6359     bool aSign;
6360     int32_t aExp, shiftCount;
6361     uint64_t aSig0, aSig1;
6362 
6363     aSig1 = extractFloat128Frac1( a );
6364     aSig0 = extractFloat128Frac0( a );
6365     aExp = extractFloat128Exp( a );
6366     aSign = extractFloat128Sign( a );
6367     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
6368     if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
6369     aSig0 |= ( aSig1 != 0 );
6370     shiftCount = 0x4028 - aExp;
6371     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
6372     return roundAndPackInt32(aSign, aSig0, status);
6373 
6374 }
6375 
6376 /*----------------------------------------------------------------------------
6377 | Returns the result of converting the quadruple-precision floating-point
6378 | value `a' to the 32-bit two's complement integer format.  The conversion
6379 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6380 | Arithmetic, except that the conversion is always rounded toward zero.  If
6381 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
6382 | conversion overflows, the largest integer with the same sign as `a' is
6383 | returned.
6384 *----------------------------------------------------------------------------*/
6385 
6386 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
6387 {
6388     bool aSign;
6389     int32_t aExp, shiftCount;
6390     uint64_t aSig0, aSig1, savedASig;
6391     int32_t z;
6392 
6393     aSig1 = extractFloat128Frac1( a );
6394     aSig0 = extractFloat128Frac0( a );
6395     aExp = extractFloat128Exp( a );
6396     aSign = extractFloat128Sign( a );
6397     aSig0 |= ( aSig1 != 0 );
6398     if ( 0x401E < aExp ) {
6399         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
6400         goto invalid;
6401     }
6402     else if ( aExp < 0x3FFF ) {
6403         if (aExp || aSig0) {
6404             status->float_exception_flags |= float_flag_inexact;
6405         }
6406         return 0;
6407     }
6408     aSig0 |= UINT64_C(0x0001000000000000);
6409     shiftCount = 0x402F - aExp;
6410     savedASig = aSig0;
6411     aSig0 >>= shiftCount;
6412     z = aSig0;
6413     if ( aSign ) z = - z;
6414     if ( ( z < 0 ) ^ aSign ) {
6415  invalid:
6416         float_raise(float_flag_invalid, status);
6417         return aSign ? INT32_MIN : INT32_MAX;
6418     }
6419     if ( ( aSig0<<shiftCount ) != savedASig ) {
6420         status->float_exception_flags |= float_flag_inexact;
6421     }
6422     return z;
6423 
6424 }
6425 
6426 /*----------------------------------------------------------------------------
6427 | Returns the result of converting the quadruple-precision floating-point
6428 | value `a' to the 64-bit two's complement integer format.  The conversion
6429 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6430 | Arithmetic---which means in particular that the conversion is rounded
6431 | according to the current rounding mode.  If `a' is a NaN, the largest
6432 | positive integer is returned.  Otherwise, if the conversion overflows, the
6433 | largest integer with the same sign as `a' is returned.
6434 *----------------------------------------------------------------------------*/
6435 
6436 int64_t float128_to_int64(float128 a, float_status *status)
6437 {
6438     bool aSign;
6439     int32_t aExp, shiftCount;
6440     uint64_t aSig0, aSig1;
6441 
6442     aSig1 = extractFloat128Frac1( a );
6443     aSig0 = extractFloat128Frac0( a );
6444     aExp = extractFloat128Exp( a );
6445     aSign = extractFloat128Sign( a );
6446     if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
6447     shiftCount = 0x402F - aExp;
6448     if ( shiftCount <= 0 ) {
6449         if ( 0x403E < aExp ) {
6450             float_raise(float_flag_invalid, status);
6451             if (    ! aSign
6452                  || (    ( aExp == 0x7FFF )
6453                       && ( aSig1 || ( aSig0 != UINT64_C(0x0001000000000000) ) )
6454                     )
6455                ) {
6456                 return INT64_MAX;
6457             }
6458             return INT64_MIN;
6459         }
6460         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
6461     }
6462     else {
6463         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
6464     }
6465     return roundAndPackInt64(aSign, aSig0, aSig1, status);
6466 
6467 }
6468 
6469 /*----------------------------------------------------------------------------
6470 | Returns the result of converting the quadruple-precision floating-point
6471 | value `a' to the 64-bit two's complement integer format.  The conversion
6472 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6473 | Arithmetic, except that the conversion is always rounded toward zero.
6474 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
6475 | the conversion overflows, the largest integer with the same sign as `a' is
6476 | returned.
6477 *----------------------------------------------------------------------------*/
6478 
6479 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
6480 {
6481     bool aSign;
6482     int32_t aExp, shiftCount;
6483     uint64_t aSig0, aSig1;
6484     int64_t z;
6485 
6486     aSig1 = extractFloat128Frac1( a );
6487     aSig0 = extractFloat128Frac0( a );
6488     aExp = extractFloat128Exp( a );
6489     aSign = extractFloat128Sign( a );
6490     if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
6491     shiftCount = aExp - 0x402F;
6492     if ( 0 < shiftCount ) {
6493         if ( 0x403E <= aExp ) {
6494             aSig0 &= UINT64_C(0x0000FFFFFFFFFFFF);
6495             if (    ( a.high == UINT64_C(0xC03E000000000000) )
6496                  && ( aSig1 < UINT64_C(0x0002000000000000) ) ) {
6497                 if (aSig1) {
6498                     status->float_exception_flags |= float_flag_inexact;
6499                 }
6500             }
6501             else {
6502                 float_raise(float_flag_invalid, status);
6503                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
6504                     return INT64_MAX;
6505                 }
6506             }
6507             return INT64_MIN;
6508         }
6509         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
6510         if ( (uint64_t) ( aSig1<<shiftCount ) ) {
6511             status->float_exception_flags |= float_flag_inexact;
6512         }
6513     }
6514     else {
6515         if ( aExp < 0x3FFF ) {
6516             if ( aExp | aSig0 | aSig1 ) {
6517                 status->float_exception_flags |= float_flag_inexact;
6518             }
6519             return 0;
6520         }
6521         z = aSig0>>( - shiftCount );
6522         if (    aSig1
6523              || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
6524             status->float_exception_flags |= float_flag_inexact;
6525         }
6526     }
6527     if ( aSign ) z = - z;
6528     return z;
6529 
6530 }
6531 
6532 /*----------------------------------------------------------------------------
6533 | Returns the result of converting the quadruple-precision floating-point value
6534 | `a' to the 64-bit unsigned integer format.  The conversion is
6535 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6536 | Arithmetic---which means in particular that the conversion is rounded
6537 | according to the current rounding mode.  If `a' is a NaN, the largest
6538 | positive integer is returned.  If the conversion overflows, the
6539 | largest unsigned integer is returned.  If 'a' is negative, the value is
6540 | rounded and zero is returned; negative values that do not round to zero
6541 | will raise the inexact exception.
6542 *----------------------------------------------------------------------------*/
6543 
6544 uint64_t float128_to_uint64(float128 a, float_status *status)
6545 {
6546     bool aSign;
6547     int aExp;
6548     int shiftCount;
6549     uint64_t aSig0, aSig1;
6550 
6551     aSig0 = extractFloat128Frac0(a);
6552     aSig1 = extractFloat128Frac1(a);
6553     aExp = extractFloat128Exp(a);
6554     aSign = extractFloat128Sign(a);
6555     if (aSign && (aExp > 0x3FFE)) {
6556         float_raise(float_flag_invalid, status);
6557         if (float128_is_any_nan(a)) {
6558             return UINT64_MAX;
6559         } else {
6560             return 0;
6561         }
6562     }
6563     if (aExp) {
6564         aSig0 |= UINT64_C(0x0001000000000000);
6565     }
6566     shiftCount = 0x402F - aExp;
6567     if (shiftCount <= 0) {
6568         if (0x403E < aExp) {
6569             float_raise(float_flag_invalid, status);
6570             return UINT64_MAX;
6571         }
6572         shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
6573     } else {
6574         shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
6575     }
6576     return roundAndPackUint64(aSign, aSig0, aSig1, status);
6577 }
6578 
6579 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
6580 {
6581     uint64_t v;
6582     signed char current_rounding_mode = status->float_rounding_mode;
6583 
6584     set_float_rounding_mode(float_round_to_zero, status);
6585     v = float128_to_uint64(a, status);
6586     set_float_rounding_mode(current_rounding_mode, status);
6587 
6588     return v;
6589 }
6590 
6591 /*----------------------------------------------------------------------------
6592 | Returns the result of converting the quadruple-precision floating-point
6593 | value `a' to the 32-bit unsigned integer format.  The conversion
6594 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6595 | Arithmetic except that the conversion is always rounded toward zero.
6596 | If `a' is a NaN, the largest positive integer is returned.  Otherwise,
6597 | if the conversion overflows, the largest unsigned integer is returned.
6598 | If 'a' is negative, the value is rounded and zero is returned; negative
6599 | values that do not round to zero will raise the inexact exception.
6600 *----------------------------------------------------------------------------*/
6601 
6602 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
6603 {
6604     uint64_t v;
6605     uint32_t res;
6606     int old_exc_flags = get_float_exception_flags(status);
6607 
6608     v = float128_to_uint64_round_to_zero(a, status);
6609     if (v > 0xffffffff) {
6610         res = 0xffffffff;
6611     } else {
6612         return v;
6613     }
6614     set_float_exception_flags(old_exc_flags, status);
6615     float_raise(float_flag_invalid, status);
6616     return res;
6617 }
6618 
6619 /*----------------------------------------------------------------------------
6620 | Returns the result of converting the quadruple-precision floating-point value
6621 | `a' to the 32-bit unsigned integer format.  The conversion is
6622 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6623 | Arithmetic---which means in particular that the conversion is rounded
6624 | according to the current rounding mode.  If `a' is a NaN, the largest
6625 | positive integer is returned.  If the conversion overflows, the
6626 | largest unsigned integer is returned.  If 'a' is negative, the value is
6627 | rounded and zero is returned; negative values that do not round to zero
6628 | will raise the inexact exception.
6629 *----------------------------------------------------------------------------*/
6630 
6631 uint32_t float128_to_uint32(float128 a, float_status *status)
6632 {
6633     uint64_t v;
6634     uint32_t res;
6635     int old_exc_flags = get_float_exception_flags(status);
6636 
6637     v = float128_to_uint64(a, status);
6638     if (v > 0xffffffff) {
6639         res = 0xffffffff;
6640     } else {
6641         return v;
6642     }
6643     set_float_exception_flags(old_exc_flags, status);
6644     float_raise(float_flag_invalid, status);
6645     return res;
6646 }
6647 
6648 /*----------------------------------------------------------------------------
6649 | Returns the result of converting the quadruple-precision floating-point
6650 | value `a' to the single-precision floating-point format.  The conversion
6651 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6652 | Arithmetic.
6653 *----------------------------------------------------------------------------*/
6654 
6655 float32 float128_to_float32(float128 a, float_status *status)
6656 {
6657     bool aSign;
6658     int32_t aExp;
6659     uint64_t aSig0, aSig1;
6660     uint32_t zSig;
6661 
6662     aSig1 = extractFloat128Frac1( a );
6663     aSig0 = extractFloat128Frac0( a );
6664     aExp = extractFloat128Exp( a );
6665     aSign = extractFloat128Sign( a );
6666     if ( aExp == 0x7FFF ) {
6667         if ( aSig0 | aSig1 ) {
6668             return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
6669         }
6670         return packFloat32( aSign, 0xFF, 0 );
6671     }
6672     aSig0 |= ( aSig1 != 0 );
6673     shift64RightJamming( aSig0, 18, &aSig0 );
6674     zSig = aSig0;
6675     if ( aExp || zSig ) {
6676         zSig |= 0x40000000;
6677         aExp -= 0x3F81;
6678     }
6679     return roundAndPackFloat32(aSign, aExp, zSig, status);
6680 
6681 }
6682 
6683 /*----------------------------------------------------------------------------
6684 | Returns the result of converting the quadruple-precision floating-point
6685 | value `a' to the double-precision floating-point format.  The conversion
6686 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6687 | Arithmetic.
6688 *----------------------------------------------------------------------------*/
6689 
6690 float64 float128_to_float64(float128 a, float_status *status)
6691 {
6692     bool aSign;
6693     int32_t aExp;
6694     uint64_t aSig0, aSig1;
6695 
6696     aSig1 = extractFloat128Frac1( a );
6697     aSig0 = extractFloat128Frac0( a );
6698     aExp = extractFloat128Exp( a );
6699     aSign = extractFloat128Sign( a );
6700     if ( aExp == 0x7FFF ) {
6701         if ( aSig0 | aSig1 ) {
6702             return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
6703         }
6704         return packFloat64( aSign, 0x7FF, 0 );
6705     }
6706     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6707     aSig0 |= ( aSig1 != 0 );
6708     if ( aExp || aSig0 ) {
6709         aSig0 |= UINT64_C(0x4000000000000000);
6710         aExp -= 0x3C01;
6711     }
6712     return roundAndPackFloat64(aSign, aExp, aSig0, status);
6713 
6714 }
6715 
6716 /*----------------------------------------------------------------------------
6717 | Returns the result of converting the quadruple-precision floating-point
6718 | value `a' to the extended double-precision floating-point format.  The
6719 | conversion is performed according to the IEC/IEEE Standard for Binary
6720 | Floating-Point Arithmetic.
6721 *----------------------------------------------------------------------------*/
6722 
6723 floatx80 float128_to_floatx80(float128 a, float_status *status)
6724 {
6725     bool aSign;
6726     int32_t aExp;
6727     uint64_t aSig0, aSig1;
6728 
6729     aSig1 = extractFloat128Frac1( a );
6730     aSig0 = extractFloat128Frac0( a );
6731     aExp = extractFloat128Exp( a );
6732     aSign = extractFloat128Sign( a );
6733     if ( aExp == 0x7FFF ) {
6734         if ( aSig0 | aSig1 ) {
6735             floatx80 res = commonNaNToFloatx80(float128ToCommonNaN(a, status),
6736                                                status);
6737             return floatx80_silence_nan(res, status);
6738         }
6739         return packFloatx80(aSign, floatx80_infinity_high,
6740                                    floatx80_infinity_low);
6741     }
6742     if ( aExp == 0 ) {
6743         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
6744         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6745     }
6746     else {
6747         aSig0 |= UINT64_C(0x0001000000000000);
6748     }
6749     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
6750     return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
6751 
6752 }
6753 
6754 /*----------------------------------------------------------------------------
6755 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6756 | returns the result as a quadruple-precision floating-point value.  The
6757 | operation is performed according to the IEC/IEEE Standard for Binary
6758 | Floating-Point Arithmetic.
6759 *----------------------------------------------------------------------------*/
6760 
6761 float128 float128_round_to_int(float128 a, float_status *status)
6762 {
6763     bool aSign;
6764     int32_t aExp;
6765     uint64_t lastBitMask, roundBitsMask;
6766     float128 z;
6767 
6768     aExp = extractFloat128Exp( a );
6769     if ( 0x402F <= aExp ) {
6770         if ( 0x406F <= aExp ) {
6771             if (    ( aExp == 0x7FFF )
6772                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
6773                ) {
6774                 return propagateFloat128NaN(a, a, status);
6775             }
6776             return a;
6777         }
6778         lastBitMask = 1;
6779         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
6780         roundBitsMask = lastBitMask - 1;
6781         z = a;
6782         switch (status->float_rounding_mode) {
6783         case float_round_nearest_even:
6784             if ( lastBitMask ) {
6785                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
6786                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
6787             }
6788             else {
6789                 if ( (int64_t) z.low < 0 ) {
6790                     ++z.high;
6791                     if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
6792                 }
6793             }
6794             break;
6795         case float_round_ties_away:
6796             if (lastBitMask) {
6797                 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
6798             } else {
6799                 if ((int64_t) z.low < 0) {
6800                     ++z.high;
6801                 }
6802             }
6803             break;
6804         case float_round_to_zero:
6805             break;
6806         case float_round_up:
6807             if (!extractFloat128Sign(z)) {
6808                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6809             }
6810             break;
6811         case float_round_down:
6812             if (extractFloat128Sign(z)) {
6813                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6814             }
6815             break;
6816         case float_round_to_odd:
6817             /*
6818              * Note that if lastBitMask == 0, the last bit is the lsb
6819              * of high, and roundBitsMask == -1.
6820              */
6821             if ((lastBitMask ? z.low & lastBitMask : z.high & 1) == 0) {
6822                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6823             }
6824             break;
6825         default:
6826             abort();
6827         }
6828         z.low &= ~ roundBitsMask;
6829     }
6830     else {
6831         if ( aExp < 0x3FFF ) {
6832             if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
6833             status->float_exception_flags |= float_flag_inexact;
6834             aSign = extractFloat128Sign( a );
6835             switch (status->float_rounding_mode) {
6836             case float_round_nearest_even:
6837                 if (    ( aExp == 0x3FFE )
6838                      && (   extractFloat128Frac0( a )
6839                           | extractFloat128Frac1( a ) )
6840                    ) {
6841                     return packFloat128( aSign, 0x3FFF, 0, 0 );
6842                 }
6843                 break;
6844             case float_round_ties_away:
6845                 if (aExp == 0x3FFE) {
6846                     return packFloat128(aSign, 0x3FFF, 0, 0);
6847                 }
6848                 break;
6849             case float_round_down:
6850                 return
6851                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
6852                     : packFloat128( 0, 0, 0, 0 );
6853             case float_round_up:
6854                 return
6855                       aSign ? packFloat128( 1, 0, 0, 0 )
6856                     : packFloat128( 0, 0x3FFF, 0, 0 );
6857 
6858             case float_round_to_odd:
6859                 return packFloat128(aSign, 0x3FFF, 0, 0);
6860 
6861             case float_round_to_zero:
6862                 break;
6863             }
6864             return packFloat128( aSign, 0, 0, 0 );
6865         }
6866         lastBitMask = 1;
6867         lastBitMask <<= 0x402F - aExp;
6868         roundBitsMask = lastBitMask - 1;
6869         z.low = 0;
6870         z.high = a.high;
6871         switch (status->float_rounding_mode) {
6872         case float_round_nearest_even:
6873             z.high += lastBitMask>>1;
6874             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
6875                 z.high &= ~ lastBitMask;
6876             }
6877             break;
6878         case float_round_ties_away:
6879             z.high += lastBitMask>>1;
6880             break;
6881         case float_round_to_zero:
6882             break;
6883         case float_round_up:
6884             if (!extractFloat128Sign(z)) {
6885                 z.high |= ( a.low != 0 );
6886                 z.high += roundBitsMask;
6887             }
6888             break;
6889         case float_round_down:
6890             if (extractFloat128Sign(z)) {
6891                 z.high |= (a.low != 0);
6892                 z.high += roundBitsMask;
6893             }
6894             break;
6895         case float_round_to_odd:
6896             if ((z.high & lastBitMask) == 0) {
6897                 z.high |= (a.low != 0);
6898                 z.high += roundBitsMask;
6899             }
6900             break;
6901         default:
6902             abort();
6903         }
6904         z.high &= ~ roundBitsMask;
6905     }
6906     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
6907         status->float_exception_flags |= float_flag_inexact;
6908     }
6909     return z;
6910 
6911 }
6912 
6913 /*----------------------------------------------------------------------------
6914 | Returns the result of adding the absolute values of the quadruple-precision
6915 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
6916 | before being returned.  `zSign' is ignored if the result is a NaN.
6917 | The addition is performed according to the IEC/IEEE Standard for Binary
6918 | Floating-Point Arithmetic.
6919 *----------------------------------------------------------------------------*/
6920 
6921 static float128 addFloat128Sigs(float128 a, float128 b, bool zSign,
6922                                 float_status *status)
6923 {
6924     int32_t aExp, bExp, zExp;
6925     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6926     int32_t expDiff;
6927 
6928     aSig1 = extractFloat128Frac1( a );
6929     aSig0 = extractFloat128Frac0( a );
6930     aExp = extractFloat128Exp( a );
6931     bSig1 = extractFloat128Frac1( b );
6932     bSig0 = extractFloat128Frac0( b );
6933     bExp = extractFloat128Exp( b );
6934     expDiff = aExp - bExp;
6935     if ( 0 < expDiff ) {
6936         if ( aExp == 0x7FFF ) {
6937             if (aSig0 | aSig1) {
6938                 return propagateFloat128NaN(a, b, status);
6939             }
6940             return a;
6941         }
6942         if ( bExp == 0 ) {
6943             --expDiff;
6944         }
6945         else {
6946             bSig0 |= UINT64_C(0x0001000000000000);
6947         }
6948         shift128ExtraRightJamming(
6949             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
6950         zExp = aExp;
6951     }
6952     else if ( expDiff < 0 ) {
6953         if ( bExp == 0x7FFF ) {
6954             if (bSig0 | bSig1) {
6955                 return propagateFloat128NaN(a, b, status);
6956             }
6957             return packFloat128( zSign, 0x7FFF, 0, 0 );
6958         }
6959         if ( aExp == 0 ) {
6960             ++expDiff;
6961         }
6962         else {
6963             aSig0 |= UINT64_C(0x0001000000000000);
6964         }
6965         shift128ExtraRightJamming(
6966             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
6967         zExp = bExp;
6968     }
6969     else {
6970         if ( aExp == 0x7FFF ) {
6971             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6972                 return propagateFloat128NaN(a, b, status);
6973             }
6974             return a;
6975         }
6976         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6977         if ( aExp == 0 ) {
6978             if (status->flush_to_zero) {
6979                 if (zSig0 | zSig1) {
6980                     float_raise(float_flag_output_denormal, status);
6981                 }
6982                 return packFloat128(zSign, 0, 0, 0);
6983             }
6984             return packFloat128( zSign, 0, zSig0, zSig1 );
6985         }
6986         zSig2 = 0;
6987         zSig0 |= UINT64_C(0x0002000000000000);
6988         zExp = aExp;
6989         goto shiftRight1;
6990     }
6991     aSig0 |= UINT64_C(0x0001000000000000);
6992     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6993     --zExp;
6994     if ( zSig0 < UINT64_C(0x0002000000000000) ) goto roundAndPack;
6995     ++zExp;
6996  shiftRight1:
6997     shift128ExtraRightJamming(
6998         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6999  roundAndPack:
7000     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
7001 
7002 }
7003 
7004 /*----------------------------------------------------------------------------
7005 | Returns the result of subtracting the absolute values of the quadruple-
7006 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
7007 | difference is negated before being returned.  `zSign' is ignored if the
7008 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
7009 | Standard for Binary Floating-Point Arithmetic.
7010 *----------------------------------------------------------------------------*/
7011 
7012 static float128 subFloat128Sigs(float128 a, float128 b, bool zSign,
7013                                 float_status *status)
7014 {
7015     int32_t aExp, bExp, zExp;
7016     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
7017     int32_t expDiff;
7018 
7019     aSig1 = extractFloat128Frac1( a );
7020     aSig0 = extractFloat128Frac0( a );
7021     aExp = extractFloat128Exp( a );
7022     bSig1 = extractFloat128Frac1( b );
7023     bSig0 = extractFloat128Frac0( b );
7024     bExp = extractFloat128Exp( b );
7025     expDiff = aExp - bExp;
7026     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
7027     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
7028     if ( 0 < expDiff ) goto aExpBigger;
7029     if ( expDiff < 0 ) goto bExpBigger;
7030     if ( aExp == 0x7FFF ) {
7031         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
7032             return propagateFloat128NaN(a, b, status);
7033         }
7034         float_raise(float_flag_invalid, status);
7035         return float128_default_nan(status);
7036     }
7037     if ( aExp == 0 ) {
7038         aExp = 1;
7039         bExp = 1;
7040     }
7041     if ( bSig0 < aSig0 ) goto aBigger;
7042     if ( aSig0 < bSig0 ) goto bBigger;
7043     if ( bSig1 < aSig1 ) goto aBigger;
7044     if ( aSig1 < bSig1 ) goto bBigger;
7045     return packFloat128(status->float_rounding_mode == float_round_down,
7046                         0, 0, 0);
7047  bExpBigger:
7048     if ( bExp == 0x7FFF ) {
7049         if (bSig0 | bSig1) {
7050             return propagateFloat128NaN(a, b, status);
7051         }
7052         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
7053     }
7054     if ( aExp == 0 ) {
7055         ++expDiff;
7056     }
7057     else {
7058         aSig0 |= UINT64_C(0x4000000000000000);
7059     }
7060     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
7061     bSig0 |= UINT64_C(0x4000000000000000);
7062  bBigger:
7063     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
7064     zExp = bExp;
7065     zSign ^= 1;
7066     goto normalizeRoundAndPack;
7067  aExpBigger:
7068     if ( aExp == 0x7FFF ) {
7069         if (aSig0 | aSig1) {
7070             return propagateFloat128NaN(a, b, status);
7071         }
7072         return a;
7073     }
7074     if ( bExp == 0 ) {
7075         --expDiff;
7076     }
7077     else {
7078         bSig0 |= UINT64_C(0x4000000000000000);
7079     }
7080     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
7081     aSig0 |= UINT64_C(0x4000000000000000);
7082  aBigger:
7083     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
7084     zExp = aExp;
7085  normalizeRoundAndPack:
7086     --zExp;
7087     return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
7088                                          status);
7089 
7090 }
7091 
7092 /*----------------------------------------------------------------------------
7093 | Returns the result of adding the quadruple-precision floating-point values
7094 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
7095 | for Binary Floating-Point Arithmetic.
7096 *----------------------------------------------------------------------------*/
7097 
7098 float128 float128_add(float128 a, float128 b, float_status *status)
7099 {
7100     bool aSign, bSign;
7101 
7102     aSign = extractFloat128Sign( a );
7103     bSign = extractFloat128Sign( b );
7104     if ( aSign == bSign ) {
7105         return addFloat128Sigs(a, b, aSign, status);
7106     }
7107     else {
7108         return subFloat128Sigs(a, b, aSign, status);
7109     }
7110 
7111 }
7112 
7113 /*----------------------------------------------------------------------------
7114 | Returns the result of subtracting the quadruple-precision floating-point
7115 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
7116 | Standard for Binary Floating-Point Arithmetic.
7117 *----------------------------------------------------------------------------*/
7118 
7119 float128 float128_sub(float128 a, float128 b, float_status *status)
7120 {
7121     bool aSign, bSign;
7122 
7123     aSign = extractFloat128Sign( a );
7124     bSign = extractFloat128Sign( b );
7125     if ( aSign == bSign ) {
7126         return subFloat128Sigs(a, b, aSign, status);
7127     }
7128     else {
7129         return addFloat128Sigs(a, b, aSign, status);
7130     }
7131 
7132 }
7133 
7134 /*----------------------------------------------------------------------------
7135 | Returns the result of multiplying the quadruple-precision floating-point
7136 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
7137 | Standard for Binary Floating-Point Arithmetic.
7138 *----------------------------------------------------------------------------*/
7139 
7140 float128 float128_mul(float128 a, float128 b, float_status *status)
7141 {
7142     bool aSign, bSign, zSign;
7143     int32_t aExp, bExp, zExp;
7144     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
7145 
7146     aSig1 = extractFloat128Frac1( a );
7147     aSig0 = extractFloat128Frac0( a );
7148     aExp = extractFloat128Exp( a );
7149     aSign = extractFloat128Sign( a );
7150     bSig1 = extractFloat128Frac1( b );
7151     bSig0 = extractFloat128Frac0( b );
7152     bExp = extractFloat128Exp( b );
7153     bSign = extractFloat128Sign( b );
7154     zSign = aSign ^ bSign;
7155     if ( aExp == 0x7FFF ) {
7156         if (    ( aSig0 | aSig1 )
7157              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
7158             return propagateFloat128NaN(a, b, status);
7159         }
7160         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
7161         return packFloat128( zSign, 0x7FFF, 0, 0 );
7162     }
7163     if ( bExp == 0x7FFF ) {
7164         if (bSig0 | bSig1) {
7165             return propagateFloat128NaN(a, b, status);
7166         }
7167         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
7168  invalid:
7169             float_raise(float_flag_invalid, status);
7170             return float128_default_nan(status);
7171         }
7172         return packFloat128( zSign, 0x7FFF, 0, 0 );
7173     }
7174     if ( aExp == 0 ) {
7175         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
7176         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7177     }
7178     if ( bExp == 0 ) {
7179         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
7180         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
7181     }
7182     zExp = aExp + bExp - 0x4000;
7183     aSig0 |= UINT64_C(0x0001000000000000);
7184     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
7185     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
7186     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
7187     zSig2 |= ( zSig3 != 0 );
7188     if (UINT64_C( 0x0002000000000000) <= zSig0 ) {
7189         shift128ExtraRightJamming(
7190             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
7191         ++zExp;
7192     }
7193     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
7194 
7195 }
7196 
7197 /*----------------------------------------------------------------------------
7198 | Returns the result of dividing the quadruple-precision floating-point value
7199 | `a' by the corresponding value `b'.  The operation is performed according to
7200 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7201 *----------------------------------------------------------------------------*/
7202 
7203 float128 float128_div(float128 a, float128 b, float_status *status)
7204 {
7205     bool aSign, bSign, zSign;
7206     int32_t aExp, bExp, zExp;
7207     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
7208     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
7209 
7210     aSig1 = extractFloat128Frac1( a );
7211     aSig0 = extractFloat128Frac0( a );
7212     aExp = extractFloat128Exp( a );
7213     aSign = extractFloat128Sign( a );
7214     bSig1 = extractFloat128Frac1( b );
7215     bSig0 = extractFloat128Frac0( b );
7216     bExp = extractFloat128Exp( b );
7217     bSign = extractFloat128Sign( b );
7218     zSign = aSign ^ bSign;
7219     if ( aExp == 0x7FFF ) {
7220         if (aSig0 | aSig1) {
7221             return propagateFloat128NaN(a, b, status);
7222         }
7223         if ( bExp == 0x7FFF ) {
7224             if (bSig0 | bSig1) {
7225                 return propagateFloat128NaN(a, b, status);
7226             }
7227             goto invalid;
7228         }
7229         return packFloat128( zSign, 0x7FFF, 0, 0 );
7230     }
7231     if ( bExp == 0x7FFF ) {
7232         if (bSig0 | bSig1) {
7233             return propagateFloat128NaN(a, b, status);
7234         }
7235         return packFloat128( zSign, 0, 0, 0 );
7236     }
7237     if ( bExp == 0 ) {
7238         if ( ( bSig0 | bSig1 ) == 0 ) {
7239             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
7240  invalid:
7241                 float_raise(float_flag_invalid, status);
7242                 return float128_default_nan(status);
7243             }
7244             float_raise(float_flag_divbyzero, status);
7245             return packFloat128( zSign, 0x7FFF, 0, 0 );
7246         }
7247         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
7248     }
7249     if ( aExp == 0 ) {
7250         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
7251         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7252     }
7253     zExp = aExp - bExp + 0x3FFD;
7254     shortShift128Left(
7255         aSig0 | UINT64_C(0x0001000000000000), aSig1, 15, &aSig0, &aSig1 );
7256     shortShift128Left(
7257         bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 );
7258     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
7259         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
7260         ++zExp;
7261     }
7262     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
7263     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
7264     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
7265     while ( (int64_t) rem0 < 0 ) {
7266         --zSig0;
7267         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
7268     }
7269     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
7270     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
7271         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
7272         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
7273         while ( (int64_t) rem1 < 0 ) {
7274             --zSig1;
7275             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
7276         }
7277         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
7278     }
7279     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
7280     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
7281 
7282 }
7283 
7284 /*----------------------------------------------------------------------------
7285 | Returns the remainder of the quadruple-precision floating-point value `a'
7286 | with respect to the corresponding value `b'.  The operation is performed
7287 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7288 *----------------------------------------------------------------------------*/
7289 
7290 float128 float128_rem(float128 a, float128 b, float_status *status)
7291 {
7292     bool aSign, zSign;
7293     int32_t aExp, bExp, expDiff;
7294     uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
7295     uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
7296     int64_t sigMean0;
7297 
7298     aSig1 = extractFloat128Frac1( a );
7299     aSig0 = extractFloat128Frac0( a );
7300     aExp = extractFloat128Exp( a );
7301     aSign = extractFloat128Sign( a );
7302     bSig1 = extractFloat128Frac1( b );
7303     bSig0 = extractFloat128Frac0( b );
7304     bExp = extractFloat128Exp( b );
7305     if ( aExp == 0x7FFF ) {
7306         if (    ( aSig0 | aSig1 )
7307              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
7308             return propagateFloat128NaN(a, b, status);
7309         }
7310         goto invalid;
7311     }
7312     if ( bExp == 0x7FFF ) {
7313         if (bSig0 | bSig1) {
7314             return propagateFloat128NaN(a, b, status);
7315         }
7316         return a;
7317     }
7318     if ( bExp == 0 ) {
7319         if ( ( bSig0 | bSig1 ) == 0 ) {
7320  invalid:
7321             float_raise(float_flag_invalid, status);
7322             return float128_default_nan(status);
7323         }
7324         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
7325     }
7326     if ( aExp == 0 ) {
7327         if ( ( aSig0 | aSig1 ) == 0 ) return a;
7328         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7329     }
7330     expDiff = aExp - bExp;
7331     if ( expDiff < -1 ) return a;
7332     shortShift128Left(
7333         aSig0 | UINT64_C(0x0001000000000000),
7334         aSig1,
7335         15 - ( expDiff < 0 ),
7336         &aSig0,
7337         &aSig1
7338     );
7339     shortShift128Left(
7340         bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 );
7341     q = le128( bSig0, bSig1, aSig0, aSig1 );
7342     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
7343     expDiff -= 64;
7344     while ( 0 < expDiff ) {
7345         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
7346         q = ( 4 < q ) ? q - 4 : 0;
7347         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
7348         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
7349         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
7350         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
7351         expDiff -= 61;
7352     }
7353     if ( -64 < expDiff ) {
7354         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
7355         q = ( 4 < q ) ? q - 4 : 0;
7356         q >>= - expDiff;
7357         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
7358         expDiff += 52;
7359         if ( expDiff < 0 ) {
7360             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
7361         }
7362         else {
7363             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
7364         }
7365         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
7366         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
7367     }
7368     else {
7369         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
7370         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
7371     }
7372     do {
7373         alternateASig0 = aSig0;
7374         alternateASig1 = aSig1;
7375         ++q;
7376         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
7377     } while ( 0 <= (int64_t) aSig0 );
7378     add128(
7379         aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
7380     if (    ( sigMean0 < 0 )
7381          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
7382         aSig0 = alternateASig0;
7383         aSig1 = alternateASig1;
7384     }
7385     zSign = ( (int64_t) aSig0 < 0 );
7386     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
7387     return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
7388                                          status);
7389 }
7390 
7391 /*----------------------------------------------------------------------------
7392 | Returns the square root of the quadruple-precision floating-point value `a'.
7393 | The operation is performed according to the IEC/IEEE Standard for Binary
7394 | Floating-Point Arithmetic.
7395 *----------------------------------------------------------------------------*/
7396 
7397 float128 float128_sqrt(float128 a, float_status *status)
7398 {
7399     bool aSign;
7400     int32_t aExp, zExp;
7401     uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
7402     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
7403 
7404     aSig1 = extractFloat128Frac1( a );
7405     aSig0 = extractFloat128Frac0( a );
7406     aExp = extractFloat128Exp( a );
7407     aSign = extractFloat128Sign( a );
7408     if ( aExp == 0x7FFF ) {
7409         if (aSig0 | aSig1) {
7410             return propagateFloat128NaN(a, a, status);
7411         }
7412         if ( ! aSign ) return a;
7413         goto invalid;
7414     }
7415     if ( aSign ) {
7416         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
7417  invalid:
7418         float_raise(float_flag_invalid, status);
7419         return float128_default_nan(status);
7420     }
7421     if ( aExp == 0 ) {
7422         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
7423         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7424     }
7425     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
7426     aSig0 |= UINT64_C(0x0001000000000000);
7427     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
7428     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
7429     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
7430     doubleZSig0 = zSig0<<1;
7431     mul64To128( zSig0, zSig0, &term0, &term1 );
7432     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
7433     while ( (int64_t) rem0 < 0 ) {
7434         --zSig0;
7435         doubleZSig0 -= 2;
7436         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
7437     }
7438     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
7439     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
7440         if ( zSig1 == 0 ) zSig1 = 1;
7441         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
7442         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
7443         mul64To128( zSig1, zSig1, &term2, &term3 );
7444         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
7445         while ( (int64_t) rem1 < 0 ) {
7446             --zSig1;
7447             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
7448             term3 |= 1;
7449             term2 |= doubleZSig0;
7450             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
7451         }
7452         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
7453     }
7454     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
7455     return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
7456 
7457 }
7458 
7459 static inline FloatRelation
7460 floatx80_compare_internal(floatx80 a, floatx80 b, bool is_quiet,
7461                           float_status *status)
7462 {
7463     bool aSign, bSign;
7464 
7465     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
7466         float_raise(float_flag_invalid, status);
7467         return float_relation_unordered;
7468     }
7469     if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
7470           ( extractFloatx80Frac( a )<<1 ) ) ||
7471         ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
7472           ( extractFloatx80Frac( b )<<1 ) )) {
7473         if (!is_quiet ||
7474             floatx80_is_signaling_nan(a, status) ||
7475             floatx80_is_signaling_nan(b, status)) {
7476             float_raise(float_flag_invalid, status);
7477         }
7478         return float_relation_unordered;
7479     }
7480     aSign = extractFloatx80Sign( a );
7481     bSign = extractFloatx80Sign( b );
7482     if ( aSign != bSign ) {
7483 
7484         if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
7485              ( ( a.low | b.low ) == 0 ) ) {
7486             /* zero case */
7487             return float_relation_equal;
7488         } else {
7489             return 1 - (2 * aSign);
7490         }
7491     } else {
7492         /* Normalize pseudo-denormals before comparison.  */
7493         if ((a.high & 0x7fff) == 0 && a.low & UINT64_C(0x8000000000000000)) {
7494             ++a.high;
7495         }
7496         if ((b.high & 0x7fff) == 0 && b.low & UINT64_C(0x8000000000000000)) {
7497             ++b.high;
7498         }
7499         if (a.low == b.low && a.high == b.high) {
7500             return float_relation_equal;
7501         } else {
7502             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7503         }
7504     }
7505 }
7506 
7507 FloatRelation floatx80_compare(floatx80 a, floatx80 b, float_status *status)
7508 {
7509     return floatx80_compare_internal(a, b, 0, status);
7510 }
7511 
7512 FloatRelation floatx80_compare_quiet(floatx80 a, floatx80 b,
7513                                      float_status *status)
7514 {
7515     return floatx80_compare_internal(a, b, 1, status);
7516 }
7517 
7518 static inline FloatRelation
7519 float128_compare_internal(float128 a, float128 b, bool is_quiet,
7520                           float_status *status)
7521 {
7522     bool aSign, bSign;
7523 
7524     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7525           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7526         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7527           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7528         if (!is_quiet ||
7529             float128_is_signaling_nan(a, status) ||
7530             float128_is_signaling_nan(b, status)) {
7531             float_raise(float_flag_invalid, status);
7532         }
7533         return float_relation_unordered;
7534     }
7535     aSign = extractFloat128Sign( a );
7536     bSign = extractFloat128Sign( b );
7537     if ( aSign != bSign ) {
7538         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7539             /* zero case */
7540             return float_relation_equal;
7541         } else {
7542             return 1 - (2 * aSign);
7543         }
7544     } else {
7545         if (a.low == b.low && a.high == b.high) {
7546             return float_relation_equal;
7547         } else {
7548             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7549         }
7550     }
7551 }
7552 
7553 FloatRelation float128_compare(float128 a, float128 b, float_status *status)
7554 {
7555     return float128_compare_internal(a, b, 0, status);
7556 }
7557 
7558 FloatRelation float128_compare_quiet(float128 a, float128 b,
7559                                      float_status *status)
7560 {
7561     return float128_compare_internal(a, b, 1, status);
7562 }
7563 
7564 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
7565 {
7566     bool aSign;
7567     int32_t aExp;
7568     uint64_t aSig;
7569 
7570     if (floatx80_invalid_encoding(a)) {
7571         float_raise(float_flag_invalid, status);
7572         return floatx80_default_nan(status);
7573     }
7574     aSig = extractFloatx80Frac( a );
7575     aExp = extractFloatx80Exp( a );
7576     aSign = extractFloatx80Sign( a );
7577 
7578     if ( aExp == 0x7FFF ) {
7579         if ( aSig<<1 ) {
7580             return propagateFloatx80NaN(a, a, status);
7581         }
7582         return a;
7583     }
7584 
7585     if (aExp == 0) {
7586         if (aSig == 0) {
7587             return a;
7588         }
7589         aExp++;
7590     }
7591 
7592     if (n > 0x10000) {
7593         n = 0x10000;
7594     } else if (n < -0x10000) {
7595         n = -0x10000;
7596     }
7597 
7598     aExp += n;
7599     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7600                                          aSign, aExp, aSig, 0, status);
7601 }
7602 
7603 float128 float128_scalbn(float128 a, int n, float_status *status)
7604 {
7605     bool aSign;
7606     int32_t aExp;
7607     uint64_t aSig0, aSig1;
7608 
7609     aSig1 = extractFloat128Frac1( a );
7610     aSig0 = extractFloat128Frac0( a );
7611     aExp = extractFloat128Exp( a );
7612     aSign = extractFloat128Sign( a );
7613     if ( aExp == 0x7FFF ) {
7614         if ( aSig0 | aSig1 ) {
7615             return propagateFloat128NaN(a, a, status);
7616         }
7617         return a;
7618     }
7619     if (aExp != 0) {
7620         aSig0 |= UINT64_C(0x0001000000000000);
7621     } else if (aSig0 == 0 && aSig1 == 0) {
7622         return a;
7623     } else {
7624         aExp++;
7625     }
7626 
7627     if (n > 0x10000) {
7628         n = 0x10000;
7629     } else if (n < -0x10000) {
7630         n = -0x10000;
7631     }
7632 
7633     aExp += n - 1;
7634     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7635                                          , status);
7636 
7637 }
7638 
7639 static void __attribute__((constructor)) softfloat_init(void)
7640 {
7641     union_float64 ua, ub, uc, ur;
7642 
7643     if (QEMU_NO_HARDFLOAT) {
7644         return;
7645     }
7646     /*
7647      * Test that the host's FMA is not obviously broken. For example,
7648      * glibc < 2.23 can perform an incorrect FMA on certain hosts; see
7649      *   https://sourceware.org/bugzilla/show_bug.cgi?id=13304
7650      */
7651     ua.s = 0x0020000000000001ULL;
7652     ub.s = 0x3ca0000000000000ULL;
7653     uc.s = 0x0020000000000000ULL;
7654     ur.h = fma(ua.h, ub.h, uc.h);
7655     if (ur.s != 0x0020000000000001ULL) {
7656         force_soft_fma = true;
7657     }
7658 }
7659