xref: /openbmc/qemu/fpu/softfloat.c (revision 520e210c)
1 /*
2  * QEMU float support
3  *
4  * The code in this source file is derived from release 2a of the SoftFloat
5  * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6  * some later contributions) are provided under that license, as detailed below.
7  * It has subsequently been modified by contributors to the QEMU Project,
8  * so some portions are provided under:
9  *  the SoftFloat-2a license
10  *  the BSD license
11  *  GPL-v2-or-later
12  *
13  * Any future contributions to this file after December 1st 2014 will be
14  * taken to be licensed under the Softfloat-2a license unless specifically
15  * indicated otherwise.
16  */
17 
18 /*
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
22 
23 Written by John R. Hauser.  This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704.  Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980.  The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
32 
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38 
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
43 
44 ===============================================================================
45 */
46 
47 /* BSD licensing:
48  * Copyright (c) 2006, Fabrice Bellard
49  * All rights reserved.
50  *
51  * Redistribution and use in source and binary forms, with or without
52  * modification, are permitted provided that the following conditions are met:
53  *
54  * 1. Redistributions of source code must retain the above copyright notice,
55  * this list of conditions and the following disclaimer.
56  *
57  * 2. Redistributions in binary form must reproduce the above copyright notice,
58  * this list of conditions and the following disclaimer in the documentation
59  * and/or other materials provided with the distribution.
60  *
61  * 3. Neither the name of the copyright holder nor the names of its contributors
62  * may be used to endorse or promote products derived from this software without
63  * specific prior written permission.
64  *
65  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75  * THE POSSIBILITY OF SUCH DAMAGE.
76  */
77 
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79  * version 2 or later. See the COPYING file in the top-level directory.
80  */
81 
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83  * target-dependent and needs the TARGET_* macros.
84  */
85 #include "qemu/osdep.h"
86 #include <math.h>
87 #include "qemu/bitops.h"
88 #include "fpu/softfloat.h"
89 
90 /* We only need stdlib for abort() */
91 
92 /*----------------------------------------------------------------------------
93 | Primitive arithmetic functions, including multi-word arithmetic, and
94 | division and square root approximations.  (Can be specialized to target if
95 | desired.)
96 *----------------------------------------------------------------------------*/
97 #include "fpu/softfloat-macros.h"
98 
99 /*
100  * Hardfloat
101  *
102  * Fast emulation of guest FP instructions is challenging for two reasons.
103  * First, FP instruction semantics are similar but not identical, particularly
104  * when handling NaNs. Second, emulating at reasonable speed the guest FP
105  * exception flags is not trivial: reading the host's flags register with a
106  * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp],
107  * and trapping on every FP exception is not fast nor pleasant to work with.
108  *
109  * We address these challenges by leveraging the host FPU for a subset of the
110  * operations. To do this we expand on the idea presented in this paper:
111  *
112  * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a
113  * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615.
114  *
115  * The idea is thus to leverage the host FPU to (1) compute FP operations
116  * and (2) identify whether FP exceptions occurred while avoiding
117  * expensive exception flag register accesses.
118  *
119  * An important optimization shown in the paper is that given that exception
120  * flags are rarely cleared by the guest, we can avoid recomputing some flags.
121  * This is particularly useful for the inexact flag, which is very frequently
122  * raised in floating-point workloads.
123  *
124  * We optimize the code further by deferring to soft-fp whenever FP exception
125  * detection might get hairy. Two examples: (1) when at least one operand is
126  * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result
127  * and the result is < the minimum normal.
128  */
129 #define GEN_INPUT_FLUSH__NOCHECK(name, soft_t)                          \
130     static inline void name(soft_t *a, float_status *s)                 \
131     {                                                                   \
132         if (unlikely(soft_t ## _is_denormal(*a))) {                     \
133             *a = soft_t ## _set_sign(soft_t ## _zero,                   \
134                                      soft_t ## _is_neg(*a));            \
135             s->float_exception_flags |= float_flag_input_denormal;      \
136         }                                                               \
137     }
138 
139 GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck, float32)
140 GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck, float64)
141 #undef GEN_INPUT_FLUSH__NOCHECK
142 
143 #define GEN_INPUT_FLUSH1(name, soft_t)                  \
144     static inline void name(soft_t *a, float_status *s) \
145     {                                                   \
146         if (likely(!s->flush_inputs_to_zero)) {         \
147             return;                                     \
148         }                                               \
149         soft_t ## _input_flush__nocheck(a, s);          \
150     }
151 
152 GEN_INPUT_FLUSH1(float32_input_flush1, float32)
153 GEN_INPUT_FLUSH1(float64_input_flush1, float64)
154 #undef GEN_INPUT_FLUSH1
155 
156 #define GEN_INPUT_FLUSH2(name, soft_t)                                  \
157     static inline void name(soft_t *a, soft_t *b, float_status *s)      \
158     {                                                                   \
159         if (likely(!s->flush_inputs_to_zero)) {                         \
160             return;                                                     \
161         }                                                               \
162         soft_t ## _input_flush__nocheck(a, s);                          \
163         soft_t ## _input_flush__nocheck(b, s);                          \
164     }
165 
166 GEN_INPUT_FLUSH2(float32_input_flush2, float32)
167 GEN_INPUT_FLUSH2(float64_input_flush2, float64)
168 #undef GEN_INPUT_FLUSH2
169 
170 #define GEN_INPUT_FLUSH3(name, soft_t)                                  \
171     static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \
172     {                                                                   \
173         if (likely(!s->flush_inputs_to_zero)) {                         \
174             return;                                                     \
175         }                                                               \
176         soft_t ## _input_flush__nocheck(a, s);                          \
177         soft_t ## _input_flush__nocheck(b, s);                          \
178         soft_t ## _input_flush__nocheck(c, s);                          \
179     }
180 
181 GEN_INPUT_FLUSH3(float32_input_flush3, float32)
182 GEN_INPUT_FLUSH3(float64_input_flush3, float64)
183 #undef GEN_INPUT_FLUSH3
184 
/*
 * Select, per combination of input count and float size, how the generated
 * hardfloat functions classify their operands: 1 picks fpclassify(), 0 picks
 * the float32/64_* primitives.
 *
 * The single-precision settings are 0 on every host; the double-precision
 * settings are 1 on x86_64 only.
 */
#define QEMU_HARDFLOAT_1F32_USE_FP 0
#define QEMU_HARDFLOAT_2F32_USE_FP 0
#define QEMU_HARDFLOAT_3F32_USE_FP 0

#if defined(__x86_64__)
# define QEMU_HARDFLOAT_1F64_USE_FP 1
# define QEMU_HARDFLOAT_2F64_USE_FP 1
# define QEMU_HARDFLOAT_3F64_USE_FP 1
#else
# define QEMU_HARDFLOAT_1F64_USE_FP 0
# define QEMU_HARDFLOAT_2F64_USE_FP 0
# define QEMU_HARDFLOAT_3F64_USE_FP 0
#endif
205 
/*
 * QEMU_HARDFLOAT_USE_ISINF selects isinf() (1) or float{32,64}_is_infinity
 * (0) for the infinity test when !USE_FP.
 * isinf() gives a ~6% speedup on x86_64/aarch64, but on power64 it reduces
 * fp-bench performance by up to 50%, so it is enabled on the former two only.
 */
#if !defined(__x86_64__) && !defined(__aarch64__)
# define QEMU_HARDFLOAT_USE_ISINF   0
#else
# define QEMU_HARDFLOAT_USE_ISINF   1
#endif
217 
/*
 * Hardfloat relies on the inexact flag being already set, so it cannot be
 * used on targets that clear the FP flags before most FP operations, nor
 * when the host compiler is not providing exact IEEE arithmetic
 * (-ffast-math).
 *
 * QEMU_SOFTFLOAT_ATTR additionally carries noinline when hardfloat is
 * enabled.
 */
#if defined(__FAST_MATH__)
# warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \
    IEEE implementation
#endif

#if defined(TARGET_PPC) || defined(__FAST_MATH__)
# define QEMU_NO_HARDFLOAT 1
# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN
#else
# define QEMU_NO_HARDFLOAT 0
# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline))
#endif
234 
235 static inline bool can_use_fpu(const float_status *s)
236 {
237     if (QEMU_NO_HARDFLOAT) {
238         return false;
239     }
240     return likely(s->float_exception_flags & float_flag_inexact &&
241                   s->float_rounding_mode == float_round_nearest_even);
242 }
243 
244 /*
245  * Hardfloat generation functions. Each operation can have two flavors:
246  * either using softfloat primitives (e.g. float32_is_zero_or_normal) for
247  * most condition checks, or native ones (e.g. fpclassify).
248  *
249  * The flavor is chosen by the callers. Instead of using macros, we rely on the
250  * compiler to propagate constants and inline everything into the callers.
251  *
252  * We only generate functions for operations with two inputs, since only
253  * these are common enough to justify consolidating them into common code.
254  */
255 
256 typedef union {
257     float32 s;
258     float h;
259 } union_float32;
260 
261 typedef union {
262     float64 s;
263     double h;
264 } union_float64;
265 
266 typedef bool (*f32_check_fn)(union_float32 a, union_float32 b);
267 typedef bool (*f64_check_fn)(union_float64 a, union_float64 b);
268 
269 typedef float32 (*soft_f32_op2_fn)(float32 a, float32 b, float_status *s);
270 typedef float64 (*soft_f64_op2_fn)(float64 a, float64 b, float_status *s);
271 typedef float   (*hard_f32_op2_fn)(float a, float b);
272 typedef double  (*hard_f64_op2_fn)(double a, double b);
273 
274 /* 2-input is-zero-or-normal */
275 static inline bool f32_is_zon2(union_float32 a, union_float32 b)
276 {
277     if (QEMU_HARDFLOAT_2F32_USE_FP) {
278         /*
279          * Not using a temp variable for consecutive fpclassify calls ends up
280          * generating faster code.
281          */
282         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
283                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
284     }
285     return float32_is_zero_or_normal(a.s) &&
286            float32_is_zero_or_normal(b.s);
287 }
288 
289 static inline bool f64_is_zon2(union_float64 a, union_float64 b)
290 {
291     if (QEMU_HARDFLOAT_2F64_USE_FP) {
292         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
293                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
294     }
295     return float64_is_zero_or_normal(a.s) &&
296            float64_is_zero_or_normal(b.s);
297 }
298 
299 /* 3-input is-zero-or-normal */
300 static inline
301 bool f32_is_zon3(union_float32 a, union_float32 b, union_float32 c)
302 {
303     if (QEMU_HARDFLOAT_3F32_USE_FP) {
304         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
305                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
306                (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
307     }
308     return float32_is_zero_or_normal(a.s) &&
309            float32_is_zero_or_normal(b.s) &&
310            float32_is_zero_or_normal(c.s);
311 }
312 
313 static inline
314 bool f64_is_zon3(union_float64 a, union_float64 b, union_float64 c)
315 {
316     if (QEMU_HARDFLOAT_3F64_USE_FP) {
317         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
318                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
319                (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
320     }
321     return float64_is_zero_or_normal(a.s) &&
322            float64_is_zero_or_normal(b.s) &&
323            float64_is_zero_or_normal(c.s);
324 }
325 
326 static inline bool f32_is_inf(union_float32 a)
327 {
328     if (QEMU_HARDFLOAT_USE_ISINF) {
329         return isinf(a.h);
330     }
331     return float32_is_infinity(a.s);
332 }
333 
334 static inline bool f64_is_inf(union_float64 a)
335 {
336     if (QEMU_HARDFLOAT_USE_ISINF) {
337         return isinf(a.h);
338     }
339     return float64_is_infinity(a.s);
340 }
341 
342 /* Note: @fast_test and @post can be NULL */
343 static inline float32
344 float32_gen2(float32 xa, float32 xb, float_status *s,
345              hard_f32_op2_fn hard, soft_f32_op2_fn soft,
346              f32_check_fn pre, f32_check_fn post,
347              f32_check_fn fast_test, soft_f32_op2_fn fast_op)
348 {
349     union_float32 ua, ub, ur;
350 
351     ua.s = xa;
352     ub.s = xb;
353 
354     if (unlikely(!can_use_fpu(s))) {
355         goto soft;
356     }
357 
358     float32_input_flush2(&ua.s, &ub.s, s);
359     if (unlikely(!pre(ua, ub))) {
360         goto soft;
361     }
362     if (fast_test && fast_test(ua, ub)) {
363         return fast_op(ua.s, ub.s, s);
364     }
365 
366     ur.h = hard(ua.h, ub.h);
367     if (unlikely(f32_is_inf(ur))) {
368         s->float_exception_flags |= float_flag_overflow;
369     } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) {
370         if (post == NULL || post(ua, ub)) {
371             goto soft;
372         }
373     }
374     return ur.s;
375 
376  soft:
377     return soft(ua.s, ub.s, s);
378 }
379 
380 static inline float64
381 float64_gen2(float64 xa, float64 xb, float_status *s,
382              hard_f64_op2_fn hard, soft_f64_op2_fn soft,
383              f64_check_fn pre, f64_check_fn post,
384              f64_check_fn fast_test, soft_f64_op2_fn fast_op)
385 {
386     union_float64 ua, ub, ur;
387 
388     ua.s = xa;
389     ub.s = xb;
390 
391     if (unlikely(!can_use_fpu(s))) {
392         goto soft;
393     }
394 
395     float64_input_flush2(&ua.s, &ub.s, s);
396     if (unlikely(!pre(ua, ub))) {
397         goto soft;
398     }
399     if (fast_test && fast_test(ua, ub)) {
400         return fast_op(ua.s, ub.s, s);
401     }
402 
403     ur.h = hard(ua.h, ub.h);
404     if (unlikely(f64_is_inf(ur))) {
405         s->float_exception_flags |= float_flag_overflow;
406     } else if (unlikely(fabs(ur.h) <= DBL_MIN)) {
407         if (post == NULL || post(ua, ub)) {
408             goto soft;
409         }
410     }
411     return ur.s;
412 
413  soft:
414     return soft(ua.s, ub.s, s);
415 }
416 
417 /*----------------------------------------------------------------------------
418 | Returns the fraction bits of the half-precision floating-point value `a'.
419 *----------------------------------------------------------------------------*/
420 
421 static inline uint32_t extractFloat16Frac(float16 a)
422 {
423     return float16_val(a) & 0x3ff;
424 }
425 
426 /*----------------------------------------------------------------------------
427 | Returns the exponent bits of the half-precision floating-point value `a'.
428 *----------------------------------------------------------------------------*/
429 
430 static inline int extractFloat16Exp(float16 a)
431 {
432     return (float16_val(a) >> 10) & 0x1f;
433 }
434 
435 /*----------------------------------------------------------------------------
436 | Returns the fraction bits of the single-precision floating-point value `a'.
437 *----------------------------------------------------------------------------*/
438 
439 static inline uint32_t extractFloat32Frac(float32 a)
440 {
441     return float32_val(a) & 0x007FFFFF;
442 }
443 
444 /*----------------------------------------------------------------------------
445 | Returns the exponent bits of the single-precision floating-point value `a'.
446 *----------------------------------------------------------------------------*/
447 
448 static inline int extractFloat32Exp(float32 a)
449 {
450     return (float32_val(a) >> 23) & 0xFF;
451 }
452 
453 /*----------------------------------------------------------------------------
454 | Returns the sign bit of the single-precision floating-point value `a'.
455 *----------------------------------------------------------------------------*/
456 
457 static inline flag extractFloat32Sign(float32 a)
458 {
459     return float32_val(a) >> 31;
460 }
461 
462 /*----------------------------------------------------------------------------
463 | Returns the fraction bits of the double-precision floating-point value `a'.
464 *----------------------------------------------------------------------------*/
465 
466 static inline uint64_t extractFloat64Frac(float64 a)
467 {
468     return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF);
469 }
470 
471 /*----------------------------------------------------------------------------
472 | Returns the exponent bits of the double-precision floating-point value `a'.
473 *----------------------------------------------------------------------------*/
474 
475 static inline int extractFloat64Exp(float64 a)
476 {
477     return (float64_val(a) >> 52) & 0x7FF;
478 }
479 
480 /*----------------------------------------------------------------------------
481 | Returns the sign bit of the double-precision floating-point value `a'.
482 *----------------------------------------------------------------------------*/
483 
484 static inline flag extractFloat64Sign(float64 a)
485 {
486     return float64_val(a) >> 63;
487 }
488 
/*
 * Classification of a floating point number.  The two NaN classes are
 * deliberately last, so that cls >= float_class_qnan means "any NaN".
 * The enum is packed to keep it as small as possible.
 */

typedef enum __attribute__ ((__packed__)) {
    float_class_unclassified = 0,
    float_class_zero         = 1,
    float_class_normal       = 2,
    float_class_inf          = 3,
    float_class_qnan         = 4,  /* all NaNs from here */
    float_class_snan         = 5,
} FloatClass;
502 
503 /* Simple helpers for checking if, or what kind of, NaN we have */
504 static inline __attribute__((unused)) bool is_nan(FloatClass c)
505 {
506     return unlikely(c >= float_class_qnan);
507 }
508 
509 static inline __attribute__((unused)) bool is_snan(FloatClass c)
510 {
511     return c == float_class_snan;
512 }
513 
514 static inline __attribute__((unused)) bool is_qnan(FloatClass c)
515 {
516     return c == float_class_qnan;
517 }
518 
519 /*
520  * Structure holding all of the decomposed parts of a float. The
521  * exponent is unbiased and the fraction is normalized. All
522  * calculations are done with a 64 bit fraction and then rounded as
523  * appropriate for the final format.
524  *
525  * Thanks to the packed FloatClass a decent compiler should be able to
526  * fit the whole structure into registers and avoid using the stack
527  * for parameter passing.
528  */
529 
530 typedef struct {
531     uint64_t frac;
532     int32_t  exp;
533     FloatClass cls;
534     bool sign;
535 } FloatParts;
536 
/*
 * Layout of the 64-bit working fraction: the binary point sits at bit 62,
 * the implicit leading one occupies that bit, and the bit above it (bit 63)
 * catches a carry out of the fraction.
 */
#define DECOMPOSED_BINARY_POINT    (64 - 2)
#define DECOMPOSED_IMPLICIT_BIT    (1ull << DECOMPOSED_BINARY_POINT)
#define DECOMPOSED_OVERFLOW_BIT    (DECOMPOSED_IMPLICIT_BIT << 1)

/* Structure holding all of the relevant parameters for a format.
 *   exp_size: the size of the exponent field
 *   exp_bias: the offset applied to the exponent field
 *   exp_max: the maximum normalised exponent
 *   frac_size: the size of the fraction field
 *   frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
 * The following are computed based on the size of the fraction
 *   frac_lsb: least significant bit of fraction
 *   frac_lsbm1: the bit below the least significant bit (for rounding)
 *   round_mask/roundeven_mask: masks used for rounding
 * The following optional modifiers are available:
 *   arm_althp: handle ARM Alternative Half Precision
 */
typedef struct {
    int exp_size;
    int exp_bias;
    int exp_max;
    int frac_size;
    int frac_shift;
    uint64_t frac_lsb;
    uint64_t frac_lsbm1;
    uint64_t round_mask;
    uint64_t roundeven_mask;
    bool arm_althp;
} FloatFmt;

/*
 * Expand fields based on the size of exponent (E) and fraction (F).
 * Both arguments are parenthesized on every use so that expression
 * arguments (e.g. FLOAT_PARAMS(8, 20 + 3)) expand correctly.
 */
#define FLOAT_PARAMS(E, F)                                             \
    .exp_size       = (E),                                             \
    .exp_bias       = ((1 << (E)) - 1) >> 1,                           \
    .exp_max        = (1 << (E)) - 1,                                  \
    .frac_size      = (F),                                             \
    .frac_shift     = DECOMPOSED_BINARY_POINT - (F),                   \
    .frac_lsb       = 1ull << (DECOMPOSED_BINARY_POINT - (F)),         \
    .frac_lsbm1     = 1ull << ((DECOMPOSED_BINARY_POINT - (F)) - 1),   \
    .round_mask     = (1ull << (DECOMPOSED_BINARY_POINT - (F))) - 1,   \
    .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - (F))) - 1

/* IEEE half precision: 5 exponent bits, 10 fraction bits. */
static const FloatFmt float16_params = {
    FLOAT_PARAMS(5, 10)
};

/* Same field sizes as float16_params, with ARM alternative half precision. */
static const FloatFmt float16_params_ahp = {
    FLOAT_PARAMS(5, 10),
    .arm_althp = true
};

/* Single precision: 8 exponent bits, 23 fraction bits. */
static const FloatFmt float32_params = {
    FLOAT_PARAMS(8, 23)
};

/* Double precision: 11 exponent bits, 52 fraction bits. */
static const FloatFmt float64_params = {
    FLOAT_PARAMS(11, 52)
};
595 
596 /* Unpack a float to parts, but do not canonicalize.  */
597 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw)
598 {
599     const int sign_pos = fmt.frac_size + fmt.exp_size;
600 
601     return (FloatParts) {
602         .cls = float_class_unclassified,
603         .sign = extract64(raw, sign_pos, 1),
604         .exp = extract64(raw, fmt.frac_size, fmt.exp_size),
605         .frac = extract64(raw, 0, fmt.frac_size),
606     };
607 }
608 
609 static inline FloatParts float16_unpack_raw(float16 f)
610 {
611     return unpack_raw(float16_params, f);
612 }
613 
614 static inline FloatParts float32_unpack_raw(float32 f)
615 {
616     return unpack_raw(float32_params, f);
617 }
618 
619 static inline FloatParts float64_unpack_raw(float64 f)
620 {
621     return unpack_raw(float64_params, f);
622 }
623 
624 /* Pack a float from parts, but do not canonicalize.  */
625 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p)
626 {
627     const int sign_pos = fmt.frac_size + fmt.exp_size;
628     uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp);
629     return deposit64(ret, sign_pos, 1, p.sign);
630 }
631 
632 static inline float16 float16_pack_raw(FloatParts p)
633 {
634     return make_float16(pack_raw(float16_params, p));
635 }
636 
637 static inline float32 float32_pack_raw(FloatParts p)
638 {
639     return make_float32(pack_raw(float32_params, p));
640 }
641 
642 static inline float64 float64_pack_raw(FloatParts p)
643 {
644     return make_float64(pack_raw(float64_params, p));
645 }
646 
647 /*----------------------------------------------------------------------------
648 | Functions and definitions to determine:  (1) whether tininess for underflow
649 | is detected before or after rounding by default, (2) what (if anything)
650 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
651 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
652 | are propagated from function inputs to output.  These details are target-
653 | specific.
654 *----------------------------------------------------------------------------*/
655 #include "softfloat-specialize.h"
656 
657 /* Canonicalize EXP and FRAC, setting CLS.  */
658 static FloatParts sf_canonicalize(FloatParts part, const FloatFmt *parm,
659                                   float_status *status)
660 {
661     if (part.exp == parm->exp_max && !parm->arm_althp) {
662         if (part.frac == 0) {
663             part.cls = float_class_inf;
664         } else {
665             part.frac <<= parm->frac_shift;
666             part.cls = (parts_is_snan_frac(part.frac, status)
667                         ? float_class_snan : float_class_qnan);
668         }
669     } else if (part.exp == 0) {
670         if (likely(part.frac == 0)) {
671             part.cls = float_class_zero;
672         } else if (status->flush_inputs_to_zero) {
673             float_raise(float_flag_input_denormal, status);
674             part.cls = float_class_zero;
675             part.frac = 0;
676         } else {
677             int shift = clz64(part.frac) - 1;
678             part.cls = float_class_normal;
679             part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
680             part.frac <<= shift;
681         }
682     } else {
683         part.cls = float_class_normal;
684         part.exp -= parm->exp_bias;
685         part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
686     }
687     return part;
688 }
689 
690 /* Round and uncanonicalize a floating-point number by parts. There
691  * are FRAC_SHIFT bits that may require rounding at the bottom of the
692  * fraction; these bits will be removed. The exponent will be biased
693  * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
694  */
695 
696 static FloatParts round_canonical(FloatParts p, float_status *s,
697                                   const FloatFmt *parm)
698 {
699     const uint64_t frac_lsbm1 = parm->frac_lsbm1;
700     const uint64_t round_mask = parm->round_mask;
701     const uint64_t roundeven_mask = parm->roundeven_mask;
702     const int exp_max = parm->exp_max;
703     const int frac_shift = parm->frac_shift;
704     uint64_t frac, inc;
705     int exp, flags = 0;
706     bool overflow_norm;
707 
708     frac = p.frac;
709     exp = p.exp;
710 
711     switch (p.cls) {
712     case float_class_normal:
713         switch (s->float_rounding_mode) {
714         case float_round_nearest_even:
715             overflow_norm = false;
716             inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
717             break;
718         case float_round_ties_away:
719             overflow_norm = false;
720             inc = frac_lsbm1;
721             break;
722         case float_round_to_zero:
723             overflow_norm = true;
724             inc = 0;
725             break;
726         case float_round_up:
727             inc = p.sign ? 0 : round_mask;
728             overflow_norm = p.sign;
729             break;
730         case float_round_down:
731             inc = p.sign ? round_mask : 0;
732             overflow_norm = !p.sign;
733             break;
734         default:
735             g_assert_not_reached();
736         }
737 
738         exp += parm->exp_bias;
739         if (likely(exp > 0)) {
740             if (frac & round_mask) {
741                 flags |= float_flag_inexact;
742                 frac += inc;
743                 if (frac & DECOMPOSED_OVERFLOW_BIT) {
744                     frac >>= 1;
745                     exp++;
746                 }
747             }
748             frac >>= frac_shift;
749 
750             if (parm->arm_althp) {
751                 /* ARM Alt HP eschews Inf and NaN for a wider exponent.  */
752                 if (unlikely(exp > exp_max)) {
753                     /* Overflow.  Return the maximum normal.  */
754                     flags = float_flag_invalid;
755                     exp = exp_max;
756                     frac = -1;
757                 }
758             } else if (unlikely(exp >= exp_max)) {
759                 flags |= float_flag_overflow | float_flag_inexact;
760                 if (overflow_norm) {
761                     exp = exp_max - 1;
762                     frac = -1;
763                 } else {
764                     p.cls = float_class_inf;
765                     goto do_inf;
766                 }
767             }
768         } else if (s->flush_to_zero) {
769             flags |= float_flag_output_denormal;
770             p.cls = float_class_zero;
771             goto do_zero;
772         } else {
773             bool is_tiny = (s->float_detect_tininess
774                             == float_tininess_before_rounding)
775                         || (exp < 0)
776                         || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT);
777 
778             shift64RightJamming(frac, 1 - exp, &frac);
779             if (frac & round_mask) {
780                 /* Need to recompute round-to-even.  */
781                 if (s->float_rounding_mode == float_round_nearest_even) {
782                     inc = ((frac & roundeven_mask) != frac_lsbm1
783                            ? frac_lsbm1 : 0);
784                 }
785                 flags |= float_flag_inexact;
786                 frac += inc;
787             }
788 
789             exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
790             frac >>= frac_shift;
791 
792             if (is_tiny && (flags & float_flag_inexact)) {
793                 flags |= float_flag_underflow;
794             }
795             if (exp == 0 && frac == 0) {
796                 p.cls = float_class_zero;
797             }
798         }
799         break;
800 
801     case float_class_zero:
802     do_zero:
803         exp = 0;
804         frac = 0;
805         break;
806 
807     case float_class_inf:
808     do_inf:
809         assert(!parm->arm_althp);
810         exp = exp_max;
811         frac = 0;
812         break;
813 
814     case float_class_qnan:
815     case float_class_snan:
816         assert(!parm->arm_althp);
817         exp = exp_max;
818         frac >>= parm->frac_shift;
819         break;
820 
821     default:
822         g_assert_not_reached();
823     }
824 
825     float_raise(flags, s);
826     p.exp = exp;
827     p.frac = frac;
828     return p;
829 }
830 
831 /* Explicit FloatFmt version */
832 static FloatParts float16a_unpack_canonical(float16 f, float_status *s,
833                                             const FloatFmt *params)
834 {
835     return sf_canonicalize(float16_unpack_raw(f), params, s);
836 }
837 
838 static FloatParts float16_unpack_canonical(float16 f, float_status *s)
839 {
840     return float16a_unpack_canonical(f, s, &float16_params);
841 }
842 
843 static float16 float16a_round_pack_canonical(FloatParts p, float_status *s,
844                                              const FloatFmt *params)
845 {
846     return float16_pack_raw(round_canonical(p, s, params));
847 }
848 
849 static float16 float16_round_pack_canonical(FloatParts p, float_status *s)
850 {
851     return float16a_round_pack_canonical(p, s, &float16_params);
852 }
853 
854 static FloatParts float32_unpack_canonical(float32 f, float_status *s)
855 {
856     return sf_canonicalize(float32_unpack_raw(f), &float32_params, s);
857 }
858 
859 static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
860 {
861     return float32_pack_raw(round_canonical(p, s, &float32_params));
862 }
863 
864 static FloatParts float64_unpack_canonical(float64 f, float_status *s)
865 {
866     return sf_canonicalize(float64_unpack_raw(f), &float64_params, s);
867 }
868 
869 static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
870 {
871     return float64_pack_raw(round_canonical(p, s, &float64_params));
872 }
873 
874 static FloatParts return_nan(FloatParts a, float_status *s)
875 {
876     switch (a.cls) {
877     case float_class_snan:
878         s->float_exception_flags |= float_flag_invalid;
879         a = parts_silence_nan(a, s);
880         /* fall through */
881     case float_class_qnan:
882         if (s->default_nan_mode) {
883             return parts_default_nan(s);
884         }
885         break;
886 
887     default:
888         g_assert_not_reached();
889     }
890     return a;
891 }
892 
893 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s)
894 {
895     if (is_snan(a.cls) || is_snan(b.cls)) {
896         s->float_exception_flags |= float_flag_invalid;
897     }
898 
899     if (s->default_nan_mode) {
900         return parts_default_nan(s);
901     } else {
902         if (pickNaN(a.cls, b.cls,
903                     a.frac > b.frac ||
904                     (a.frac == b.frac && a.sign < b.sign))) {
905             a = b;
906         }
907         if (is_snan(a.cls)) {
908             return parts_silence_nan(a, s);
909         }
910     }
911     return a;
912 }
913 
914 static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c,
915                                   bool inf_zero, float_status *s)
916 {
917     int which;
918 
919     if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
920         s->float_exception_flags |= float_flag_invalid;
921     }
922 
923     which = pickNaNMulAdd(a.cls, b.cls, c.cls, inf_zero, s);
924 
925     if (s->default_nan_mode) {
926         /* Note that this check is after pickNaNMulAdd so that function
927          * has an opportunity to set the Invalid flag.
928          */
929         which = 3;
930     }
931 
932     switch (which) {
933     case 0:
934         break;
935     case 1:
936         a = b;
937         break;
938     case 2:
939         a = c;
940         break;
941     case 3:
942         return parts_default_nan(s);
943     default:
944         g_assert_not_reached();
945     }
946 
947     if (is_snan(a.cls)) {
948         return parts_silence_nan(a, s);
949     }
950     return a;
951 }
952 
953 /*
954  * Returns the result of adding or subtracting the values of the
955  * floating-point values `a' and `b'. The operation is performed
956  * according to the IEC/IEEE Standard for Binary Floating-Point
957  * Arithmetic.
958  */
959 
960 static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
961                                 float_status *s)
962 {
963     bool a_sign = a.sign;
964     bool b_sign = b.sign ^ subtract;
965 
966     if (a_sign != b_sign) {
967         /* Subtraction */
968 
969         if (a.cls == float_class_normal && b.cls == float_class_normal) {
970             if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
971                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
972                 a.frac = a.frac - b.frac;
973             } else {
974                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
975                 a.frac = b.frac - a.frac;
976                 a.exp = b.exp;
977                 a_sign ^= 1;
978             }
979 
980             if (a.frac == 0) {
981                 a.cls = float_class_zero;
982                 a.sign = s->float_rounding_mode == float_round_down;
983             } else {
984                 int shift = clz64(a.frac) - 1;
985                 a.frac = a.frac << shift;
986                 a.exp = a.exp - shift;
987                 a.sign = a_sign;
988             }
989             return a;
990         }
991         if (is_nan(a.cls) || is_nan(b.cls)) {
992             return pick_nan(a, b, s);
993         }
994         if (a.cls == float_class_inf) {
995             if (b.cls == float_class_inf) {
996                 float_raise(float_flag_invalid, s);
997                 return parts_default_nan(s);
998             }
999             return a;
1000         }
1001         if (a.cls == float_class_zero && b.cls == float_class_zero) {
1002             a.sign = s->float_rounding_mode == float_round_down;
1003             return a;
1004         }
1005         if (a.cls == float_class_zero || b.cls == float_class_inf) {
1006             b.sign = a_sign ^ 1;
1007             return b;
1008         }
1009         if (b.cls == float_class_zero) {
1010             return a;
1011         }
1012     } else {
1013         /* Addition */
1014         if (a.cls == float_class_normal && b.cls == float_class_normal) {
1015             if (a.exp > b.exp) {
1016                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
1017             } else if (a.exp < b.exp) {
1018                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
1019                 a.exp = b.exp;
1020             }
1021             a.frac += b.frac;
1022             if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
1023                 shift64RightJamming(a.frac, 1, &a.frac);
1024                 a.exp += 1;
1025             }
1026             return a;
1027         }
1028         if (is_nan(a.cls) || is_nan(b.cls)) {
1029             return pick_nan(a, b, s);
1030         }
1031         if (a.cls == float_class_inf || b.cls == float_class_zero) {
1032             return a;
1033         }
1034         if (b.cls == float_class_inf || a.cls == float_class_zero) {
1035             b.sign = b_sign;
1036             return b;
1037         }
1038     }
1039     g_assert_not_reached();
1040 }
1041 
1042 /*
1043  * Returns the result of adding or subtracting the floating-point
1044  * values `a' and `b'. The operation is performed according to the
1045  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1046  */
1047 
1048 float16 QEMU_FLATTEN float16_add(float16 a, float16 b, float_status *status)
1049 {
1050     FloatParts pa = float16_unpack_canonical(a, status);
1051     FloatParts pb = float16_unpack_canonical(b, status);
1052     FloatParts pr = addsub_floats(pa, pb, false, status);
1053 
1054     return float16_round_pack_canonical(pr, status);
1055 }
1056 
1057 float16 QEMU_FLATTEN float16_sub(float16 a, float16 b, float_status *status)
1058 {
1059     FloatParts pa = float16_unpack_canonical(a, status);
1060     FloatParts pb = float16_unpack_canonical(b, status);
1061     FloatParts pr = addsub_floats(pa, pb, true, status);
1062 
1063     return float16_round_pack_canonical(pr, status);
1064 }
1065 
1066 static float32 QEMU_SOFTFLOAT_ATTR
1067 soft_f32_addsub(float32 a, float32 b, bool subtract, float_status *status)
1068 {
1069     FloatParts pa = float32_unpack_canonical(a, status);
1070     FloatParts pb = float32_unpack_canonical(b, status);
1071     FloatParts pr = addsub_floats(pa, pb, subtract, status);
1072 
1073     return float32_round_pack_canonical(pr, status);
1074 }
1075 
1076 static inline float32 soft_f32_add(float32 a, float32 b, float_status *status)
1077 {
1078     return soft_f32_addsub(a, b, false, status);
1079 }
1080 
1081 static inline float32 soft_f32_sub(float32 a, float32 b, float_status *status)
1082 {
1083     return soft_f32_addsub(a, b, true, status);
1084 }
1085 
1086 static float64 QEMU_SOFTFLOAT_ATTR
1087 soft_f64_addsub(float64 a, float64 b, bool subtract, float_status *status)
1088 {
1089     FloatParts pa = float64_unpack_canonical(a, status);
1090     FloatParts pb = float64_unpack_canonical(b, status);
1091     FloatParts pr = addsub_floats(pa, pb, subtract, status);
1092 
1093     return float64_round_pack_canonical(pr, status);
1094 }
1095 
1096 static inline float64 soft_f64_add(float64 a, float64 b, float_status *status)
1097 {
1098     return soft_f64_addsub(a, b, false, status);
1099 }
1100 
1101 static inline float64 soft_f64_sub(float64 a, float64 b, float_status *status)
1102 {
1103     return soft_f64_addsub(a, b, true, status);
1104 }
1105 
/*
 * Host-arithmetic single-precision addition; supplied as the "hard"
 * callback by float32_add().
 */
static float hard_f32_add(float a, float b)
{
    return (a + b);
}
1110 
/*
 * Host-arithmetic single-precision subtraction; supplied as the "hard"
 * callback by float32_sub().
 */
static float hard_f32_sub(float a, float b)
{
    return (a - b);
}
1115 
/*
 * Host-arithmetic double-precision addition; supplied as the "hard"
 * callback by float64_add().
 */
static double hard_f64_add(double a, double b)
{
    return (a + b);
}
1120 
/*
 * Host-arithmetic double-precision subtraction; supplied as the "hard"
 * callback by float64_sub().
 */
static double hard_f64_sub(double a, double b)
{
    return (a - b);
}
1125 
1126 static bool f32_addsub_post(union_float32 a, union_float32 b)
1127 {
1128     if (QEMU_HARDFLOAT_2F32_USE_FP) {
1129         return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
1130     }
1131     return !(float32_is_zero(a.s) && float32_is_zero(b.s));
1132 }
1133 
1134 static bool f64_addsub_post(union_float64 a, union_float64 b)
1135 {
1136     if (QEMU_HARDFLOAT_2F64_USE_FP) {
1137         return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
1138     } else {
1139         return !(float64_is_zero(a.s) && float64_is_zero(b.s));
1140     }
1141 }
1142 
1143 static float32 float32_addsub(float32 a, float32 b, float_status *s,
1144                               hard_f32_op2_fn hard, soft_f32_op2_fn soft)
1145 {
1146     return float32_gen2(a, b, s, hard, soft,
1147                         f32_is_zon2, f32_addsub_post, NULL, NULL);
1148 }
1149 
1150 static float64 float64_addsub(float64 a, float64 b, float_status *s,
1151                               hard_f64_op2_fn hard, soft_f64_op2_fn soft)
1152 {
1153     return float64_gen2(a, b, s, hard, soft,
1154                         f64_is_zon2, f64_addsub_post, NULL, NULL);
1155 }
1156 
1157 float32 QEMU_FLATTEN
1158 float32_add(float32 a, float32 b, float_status *s)
1159 {
1160     return float32_addsub(a, b, s, hard_f32_add, soft_f32_add);
1161 }
1162 
1163 float32 QEMU_FLATTEN
1164 float32_sub(float32 a, float32 b, float_status *s)
1165 {
1166     return float32_addsub(a, b, s, hard_f32_sub, soft_f32_sub);
1167 }
1168 
1169 float64 QEMU_FLATTEN
1170 float64_add(float64 a, float64 b, float_status *s)
1171 {
1172     return float64_addsub(a, b, s, hard_f64_add, soft_f64_add);
1173 }
1174 
1175 float64 QEMU_FLATTEN
1176 float64_sub(float64 a, float64 b, float_status *s)
1177 {
1178     return float64_addsub(a, b, s, hard_f64_sub, soft_f64_sub);
1179 }
1180 
1181 /*
1182  * Returns the result of multiplying the floating-point values `a' and
1183  * `b'. The operation is performed according to the IEC/IEEE Standard
1184  * for Binary Floating-Point Arithmetic.
1185  */
1186 
1187 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s)
1188 {
1189     bool sign = a.sign ^ b.sign;
1190 
1191     if (a.cls == float_class_normal && b.cls == float_class_normal) {
1192         uint64_t hi, lo;
1193         int exp = a.exp + b.exp;
1194 
1195         mul64To128(a.frac, b.frac, &hi, &lo);
1196         shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1197         if (lo & DECOMPOSED_OVERFLOW_BIT) {
1198             shift64RightJamming(lo, 1, &lo);
1199             exp += 1;
1200         }
1201 
1202         /* Re-use a */
1203         a.exp = exp;
1204         a.sign = sign;
1205         a.frac = lo;
1206         return a;
1207     }
1208     /* handle all the NaN cases */
1209     if (is_nan(a.cls) || is_nan(b.cls)) {
1210         return pick_nan(a, b, s);
1211     }
1212     /* Inf * Zero == NaN */
1213     if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
1214         (a.cls == float_class_zero && b.cls == float_class_inf)) {
1215         s->float_exception_flags |= float_flag_invalid;
1216         return parts_default_nan(s);
1217     }
1218     /* Multiply by 0 or Inf */
1219     if (a.cls == float_class_inf || a.cls == float_class_zero) {
1220         a.sign = sign;
1221         return a;
1222     }
1223     if (b.cls == float_class_inf || b.cls == float_class_zero) {
1224         b.sign = sign;
1225         return b;
1226     }
1227     g_assert_not_reached();
1228 }
1229 
1230 float16 QEMU_FLATTEN float16_mul(float16 a, float16 b, float_status *status)
1231 {
1232     FloatParts pa = float16_unpack_canonical(a, status);
1233     FloatParts pb = float16_unpack_canonical(b, status);
1234     FloatParts pr = mul_floats(pa, pb, status);
1235 
1236     return float16_round_pack_canonical(pr, status);
1237 }
1238 
1239 static float32 QEMU_SOFTFLOAT_ATTR
1240 soft_f32_mul(float32 a, float32 b, float_status *status)
1241 {
1242     FloatParts pa = float32_unpack_canonical(a, status);
1243     FloatParts pb = float32_unpack_canonical(b, status);
1244     FloatParts pr = mul_floats(pa, pb, status);
1245 
1246     return float32_round_pack_canonical(pr, status);
1247 }
1248 
1249 static float64 QEMU_SOFTFLOAT_ATTR
1250 soft_f64_mul(float64 a, float64 b, float_status *status)
1251 {
1252     FloatParts pa = float64_unpack_canonical(a, status);
1253     FloatParts pb = float64_unpack_canonical(b, status);
1254     FloatParts pr = mul_floats(pa, pb, status);
1255 
1256     return float64_round_pack_canonical(pr, status);
1257 }
1258 
/*
 * Host-arithmetic single-precision multiplication; supplied as the
 * "hard" callback by float32_mul().
 */
static float hard_f32_mul(float a, float b)
{
    return (a * b);
}
1263 
/*
 * Host-arithmetic double-precision multiplication; supplied as the
 * "hard" callback by float64_mul().
 */
static double hard_f64_mul(double a, double b)
{
    return (a * b);
}
1268 
1269 static bool f32_mul_fast_test(union_float32 a, union_float32 b)
1270 {
1271     return float32_is_zero(a.s) || float32_is_zero(b.s);
1272 }
1273 
1274 static bool f64_mul_fast_test(union_float64 a, union_float64 b)
1275 {
1276     return float64_is_zero(a.s) || float64_is_zero(b.s);
1277 }
1278 
1279 static float32 f32_mul_fast_op(float32 a, float32 b, float_status *s)
1280 {
1281     bool signbit = float32_is_neg(a) ^ float32_is_neg(b);
1282 
1283     return float32_set_sign(float32_zero, signbit);
1284 }
1285 
1286 static float64 f64_mul_fast_op(float64 a, float64 b, float_status *s)
1287 {
1288     bool signbit = float64_is_neg(a) ^ float64_is_neg(b);
1289 
1290     return float64_set_sign(float64_zero, signbit);
1291 }
1292 
1293 float32 QEMU_FLATTEN
1294 float32_mul(float32 a, float32 b, float_status *s)
1295 {
1296     return float32_gen2(a, b, s, hard_f32_mul, soft_f32_mul,
1297                         f32_is_zon2, NULL, f32_mul_fast_test, f32_mul_fast_op);
1298 }
1299 
1300 float64 QEMU_FLATTEN
1301 float64_mul(float64 a, float64 b, float_status *s)
1302 {
1303     return float64_gen2(a, b, s, hard_f64_mul, soft_f64_mul,
1304                         f64_is_zon2, NULL, f64_mul_fast_test, f64_mul_fast_op);
1305 }
1306 
1307 /*
1308  * Returns the result of multiplying the floating-point values `a' and
1309  * `b' then adding 'c', with no intermediate rounding step after the
1310  * multiplication. The operation is performed according to the
1311  * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
1312  * The flags argument allows the caller to select negation of the
1313  * addend, the intermediate product, or the final result. (The
1314  * difference between this and having the caller do a separate
1315  * negation is that negating externally will flip the sign bit on
1316  * NaNs.)
1317  */
1318 
1319 static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c,
1320                                 int flags, float_status *s)
1321 {
1322     bool inf_zero = ((1 << a.cls) | (1 << b.cls)) ==
1323                     ((1 << float_class_inf) | (1 << float_class_zero));
1324     bool p_sign;
1325     bool sign_flip = flags & float_muladd_negate_result;
1326     FloatClass p_class;
1327     uint64_t hi, lo;
1328     int p_exp;
1329 
1330     /* It is implementation-defined whether the cases of (0,inf,qnan)
1331      * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
1332      * they return if they do), so we have to hand this information
1333      * off to the target-specific pick-a-NaN routine.
1334      */
1335     if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) {
1336         return pick_nan_muladd(a, b, c, inf_zero, s);
1337     }
1338 
1339     if (inf_zero) {
1340         s->float_exception_flags |= float_flag_invalid;
1341         return parts_default_nan(s);
1342     }
1343 
1344     if (flags & float_muladd_negate_c) {
1345         c.sign ^= 1;
1346     }
1347 
1348     p_sign = a.sign ^ b.sign;
1349 
1350     if (flags & float_muladd_negate_product) {
1351         p_sign ^= 1;
1352     }
1353 
1354     if (a.cls == float_class_inf || b.cls == float_class_inf) {
1355         p_class = float_class_inf;
1356     } else if (a.cls == float_class_zero || b.cls == float_class_zero) {
1357         p_class = float_class_zero;
1358     } else {
1359         p_class = float_class_normal;
1360     }
1361 
1362     if (c.cls == float_class_inf) {
1363         if (p_class == float_class_inf && p_sign != c.sign) {
1364             s->float_exception_flags |= float_flag_invalid;
1365             return parts_default_nan(s);
1366         } else {
1367             a.cls = float_class_inf;
1368             a.sign = c.sign ^ sign_flip;
1369             return a;
1370         }
1371     }
1372 
1373     if (p_class == float_class_inf) {
1374         a.cls = float_class_inf;
1375         a.sign = p_sign ^ sign_flip;
1376         return a;
1377     }
1378 
1379     if (p_class == float_class_zero) {
1380         if (c.cls == float_class_zero) {
1381             if (p_sign != c.sign) {
1382                 p_sign = s->float_rounding_mode == float_round_down;
1383             }
1384             c.sign = p_sign;
1385         } else if (flags & float_muladd_halve_result) {
1386             c.exp -= 1;
1387         }
1388         c.sign ^= sign_flip;
1389         return c;
1390     }
1391 
1392     /* a & b should be normals now... */
1393     assert(a.cls == float_class_normal &&
1394            b.cls == float_class_normal);
1395 
1396     p_exp = a.exp + b.exp;
1397 
1398     /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
1399      * result.
1400      */
1401     mul64To128(a.frac, b.frac, &hi, &lo);
1402     /* binary point now at bit 124 */
1403 
1404     /* check for overflow */
1405     if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) {
1406         shift128RightJamming(hi, lo, 1, &hi, &lo);
1407         p_exp += 1;
1408     }
1409 
1410     /* + add/sub */
1411     if (c.cls == float_class_zero) {
1412         /* move binary point back to 62 */
1413         shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1414     } else {
1415         int exp_diff = p_exp - c.exp;
1416         if (p_sign == c.sign) {
1417             /* Addition */
1418             if (exp_diff <= 0) {
1419                 shift128RightJamming(hi, lo,
1420                                      DECOMPOSED_BINARY_POINT - exp_diff,
1421                                      &hi, &lo);
1422                 lo += c.frac;
1423                 p_exp = c.exp;
1424             } else {
1425                 uint64_t c_hi, c_lo;
1426                 /* shift c to the same binary point as the product (124) */
1427                 c_hi = c.frac >> 2;
1428                 c_lo = 0;
1429                 shift128RightJamming(c_hi, c_lo,
1430                                      exp_diff,
1431                                      &c_hi, &c_lo);
1432                 add128(hi, lo, c_hi, c_lo, &hi, &lo);
1433                 /* move binary point back to 62 */
1434                 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1435             }
1436 
1437             if (lo & DECOMPOSED_OVERFLOW_BIT) {
1438                 shift64RightJamming(lo, 1, &lo);
1439                 p_exp += 1;
1440             }
1441 
1442         } else {
1443             /* Subtraction */
1444             uint64_t c_hi, c_lo;
1445             /* make C binary point match product at bit 124 */
1446             c_hi = c.frac >> 2;
1447             c_lo = 0;
1448 
1449             if (exp_diff <= 0) {
1450                 shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
1451                 if (exp_diff == 0
1452                     &&
1453                     (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
1454                     sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1455                 } else {
1456                     sub128(c_hi, c_lo, hi, lo, &hi, &lo);
1457                     p_sign ^= 1;
1458                     p_exp = c.exp;
1459                 }
1460             } else {
1461                 shift128RightJamming(c_hi, c_lo,
1462                                      exp_diff,
1463                                      &c_hi, &c_lo);
1464                 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1465             }
1466 
1467             if (hi == 0 && lo == 0) {
1468                 a.cls = float_class_zero;
1469                 a.sign = s->float_rounding_mode == float_round_down;
1470                 a.sign ^= sign_flip;
1471                 return a;
1472             } else {
1473                 int shift;
1474                 if (hi != 0) {
1475                     shift = clz64(hi);
1476                 } else {
1477                     shift = clz64(lo) + 64;
1478                 }
1479                 /* Normalizing to a binary point of 124 is the
1480                    correct adjust for the exponent.  However since we're
1481                    shifting, we might as well put the binary point back
1482                    at 62 where we really want it.  Therefore shift as
1483                    if we're leaving 1 bit at the top of the word, but
1484                    adjust the exponent as if we're leaving 3 bits.  */
1485                 shift -= 1;
1486                 if (shift >= 64) {
1487                     lo = lo << (shift - 64);
1488                 } else {
1489                     hi = (hi << shift) | (lo >> (64 - shift));
1490                     lo = hi | ((lo << shift) != 0);
1491                 }
1492                 p_exp -= shift - 2;
1493             }
1494         }
1495     }
1496 
1497     if (flags & float_muladd_halve_result) {
1498         p_exp -= 1;
1499     }
1500 
1501     /* finally prepare our result */
1502     a.cls = float_class_normal;
1503     a.sign = p_sign ^ sign_flip;
1504     a.exp = p_exp;
1505     a.frac = lo;
1506 
1507     return a;
1508 }
1509 
1510 float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c,
1511                                                 int flags, float_status *status)
1512 {
1513     FloatParts pa = float16_unpack_canonical(a, status);
1514     FloatParts pb = float16_unpack_canonical(b, status);
1515     FloatParts pc = float16_unpack_canonical(c, status);
1516     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1517 
1518     return float16_round_pack_canonical(pr, status);
1519 }
1520 
1521 static float32 QEMU_SOFTFLOAT_ATTR
1522 soft_f32_muladd(float32 a, float32 b, float32 c, int flags,
1523                 float_status *status)
1524 {
1525     FloatParts pa = float32_unpack_canonical(a, status);
1526     FloatParts pb = float32_unpack_canonical(b, status);
1527     FloatParts pc = float32_unpack_canonical(c, status);
1528     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1529 
1530     return float32_round_pack_canonical(pr, status);
1531 }
1532 
1533 static float64 QEMU_SOFTFLOAT_ATTR
1534 soft_f64_muladd(float64 a, float64 b, float64 c, int flags,
1535                 float_status *status)
1536 {
1537     FloatParts pa = float64_unpack_canonical(a, status);
1538     FloatParts pb = float64_unpack_canonical(b, status);
1539     FloatParts pc = float64_unpack_canonical(c, status);
1540     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1541 
1542     return float64_round_pack_canonical(pr, status);
1543 }
1544 
1545 static bool force_soft_fma;
1546 
1547 float32 QEMU_FLATTEN
1548 float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s)
1549 {
1550     union_float32 ua, ub, uc, ur;
1551 
1552     ua.s = xa;
1553     ub.s = xb;
1554     uc.s = xc;
1555 
1556     if (unlikely(!can_use_fpu(s))) {
1557         goto soft;
1558     }
1559     if (unlikely(flags & float_muladd_halve_result)) {
1560         goto soft;
1561     }
1562 
1563     float32_input_flush3(&ua.s, &ub.s, &uc.s, s);
1564     if (unlikely(!f32_is_zon3(ua, ub, uc))) {
1565         goto soft;
1566     }
1567 
1568     if (unlikely(force_soft_fma)) {
1569         goto soft;
1570     }
1571 
1572     /*
1573      * When (a || b) == 0, there's no need to check for under/over flow,
1574      * since we know the addend is (normal || 0) and the product is 0.
1575      */
1576     if (float32_is_zero(ua.s) || float32_is_zero(ub.s)) {
1577         union_float32 up;
1578         bool prod_sign;
1579 
1580         prod_sign = float32_is_neg(ua.s) ^ float32_is_neg(ub.s);
1581         prod_sign ^= !!(flags & float_muladd_negate_product);
1582         up.s = float32_set_sign(float32_zero, prod_sign);
1583 
1584         if (flags & float_muladd_negate_c) {
1585             uc.h = -uc.h;
1586         }
1587         ur.h = up.h + uc.h;
1588     } else {
1589         if (flags & float_muladd_negate_product) {
1590             ua.h = -ua.h;
1591         }
1592         if (flags & float_muladd_negate_c) {
1593             uc.h = -uc.h;
1594         }
1595 
1596         ur.h = fmaf(ua.h, ub.h, uc.h);
1597 
1598         if (unlikely(f32_is_inf(ur))) {
1599             s->float_exception_flags |= float_flag_overflow;
1600         } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) {
1601             goto soft;
1602         }
1603     }
1604     if (flags & float_muladd_negate_result) {
1605         return float32_chs(ur.s);
1606     }
1607     return ur.s;
1608 
1609  soft:
1610     return soft_f32_muladd(ua.s, ub.s, uc.s, flags, s);
1611 }
1612 
1613 float64 QEMU_FLATTEN
1614 float64_muladd(float64 xa, float64 xb, float64 xc, int flags, float_status *s)
1615 {
1616     union_float64 ua, ub, uc, ur;
1617 
1618     ua.s = xa;
1619     ub.s = xb;
1620     uc.s = xc;
1621 
1622     if (unlikely(!can_use_fpu(s))) {
1623         goto soft;
1624     }
1625     if (unlikely(flags & float_muladd_halve_result)) {
1626         goto soft;
1627     }
1628 
1629     float64_input_flush3(&ua.s, &ub.s, &uc.s, s);
1630     if (unlikely(!f64_is_zon3(ua, ub, uc))) {
1631         goto soft;
1632     }
1633 
1634     if (unlikely(force_soft_fma)) {
1635         goto soft;
1636     }
1637 
1638     /*
1639      * When (a || b) == 0, there's no need to check for under/over flow,
1640      * since we know the addend is (normal || 0) and the product is 0.
1641      */
1642     if (float64_is_zero(ua.s) || float64_is_zero(ub.s)) {
1643         union_float64 up;
1644         bool prod_sign;
1645 
1646         prod_sign = float64_is_neg(ua.s) ^ float64_is_neg(ub.s);
1647         prod_sign ^= !!(flags & float_muladd_negate_product);
1648         up.s = float64_set_sign(float64_zero, prod_sign);
1649 
1650         if (flags & float_muladd_negate_c) {
1651             uc.h = -uc.h;
1652         }
1653         ur.h = up.h + uc.h;
1654     } else {
1655         if (flags & float_muladd_negate_product) {
1656             ua.h = -ua.h;
1657         }
1658         if (flags & float_muladd_negate_c) {
1659             uc.h = -uc.h;
1660         }
1661 
1662         ur.h = fma(ua.h, ub.h, uc.h);
1663 
1664         if (unlikely(f64_is_inf(ur))) {
1665             s->float_exception_flags |= float_flag_overflow;
1666         } else if (unlikely(fabs(ur.h) <= FLT_MIN)) {
1667             goto soft;
1668         }
1669     }
1670     if (flags & float_muladd_negate_result) {
1671         return float64_chs(ur.s);
1672     }
1673     return ur.s;
1674 
1675  soft:
1676     return soft_f64_muladd(ua.s, ub.s, uc.s, flags, s);
1677 }
1678 
1679 /*
1680  * Returns the result of dividing the floating-point value `a' by the
1681  * corresponding value `b'. The operation is performed according to
1682  * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1683  */
1684 
1685 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s)
1686 {
1687     bool sign = a.sign ^ b.sign;
1688 
1689     if (a.cls == float_class_normal && b.cls == float_class_normal) {
1690         uint64_t n0, n1, q, r;
1691         int exp = a.exp - b.exp;
1692 
1693         /*
1694          * We want a 2*N / N-bit division to produce exactly an N-bit
1695          * result, so that we do not lose any precision and so that we
1696          * do not have to renormalize afterward.  If A.frac < B.frac,
1697          * then division would produce an (N-1)-bit result; shift A left
1698          * by one to produce the an N-bit result, and decrement the
1699          * exponent to match.
1700          *
1701          * The udiv_qrnnd algorithm that we're using requires normalization,
1702          * i.e. the msb of the denominator must be set.  Since we know that
1703          * DECOMPOSED_BINARY_POINT is msb-1, the inputs must be shifted left
1704          * by one (more), and the remainder must be shifted right by one.
1705          */
1706         if (a.frac < b.frac) {
1707             exp -= 1;
1708             shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 2, &n1, &n0);
1709         } else {
1710             shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, &n1, &n0);
1711         }
1712         q = udiv_qrnnd(&r, n1, n0, b.frac << 1);
1713 
1714         /*
1715          * Set lsb if there is a remainder, to set inexact.
1716          * As mentioned above, to find the actual value of the remainder we
1717          * would need to shift right, but (1) we are only concerned about
1718          * non-zero-ness, and (2) the remainder will always be even because
1719          * both inputs to the division primitive are even.
1720          */
1721         a.frac = q | (r != 0);
1722         a.sign = sign;
1723         a.exp = exp;
1724         return a;
1725     }
1726     /* handle all the NaN cases */
1727     if (is_nan(a.cls) || is_nan(b.cls)) {
1728         return pick_nan(a, b, s);
1729     }
1730     /* 0/0 or Inf/Inf */
1731     if (a.cls == b.cls
1732         &&
1733         (a.cls == float_class_inf || a.cls == float_class_zero)) {
1734         s->float_exception_flags |= float_flag_invalid;
1735         return parts_default_nan(s);
1736     }
1737     /* Inf / x or 0 / x */
1738     if (a.cls == float_class_inf || a.cls == float_class_zero) {
1739         a.sign = sign;
1740         return a;
1741     }
1742     /* Div 0 => Inf */
1743     if (b.cls == float_class_zero) {
1744         s->float_exception_flags |= float_flag_divbyzero;
1745         a.cls = float_class_inf;
1746         a.sign = sign;
1747         return a;
1748     }
1749     /* Div by Inf */
1750     if (b.cls == float_class_inf) {
1751         a.cls = float_class_zero;
1752         a.sign = sign;
1753         return a;
1754     }
1755     g_assert_not_reached();
1756 }
1757 
1758 float16 float16_div(float16 a, float16 b, float_status *status)
1759 {
1760     FloatParts pa = float16_unpack_canonical(a, status);
1761     FloatParts pb = float16_unpack_canonical(b, status);
1762     FloatParts pr = div_floats(pa, pb, status);
1763 
1764     return float16_round_pack_canonical(pr, status);
1765 }
1766 
1767 static float32 QEMU_SOFTFLOAT_ATTR
1768 soft_f32_div(float32 a, float32 b, float_status *status)
1769 {
1770     FloatParts pa = float32_unpack_canonical(a, status);
1771     FloatParts pb = float32_unpack_canonical(b, status);
1772     FloatParts pr = div_floats(pa, pb, status);
1773 
1774     return float32_round_pack_canonical(pr, status);
1775 }
1776 
1777 static float64 QEMU_SOFTFLOAT_ATTR
1778 soft_f64_div(float64 a, float64 b, float_status *status)
1779 {
1780     FloatParts pa = float64_unpack_canonical(a, status);
1781     FloatParts pb = float64_unpack_canonical(b, status);
1782     FloatParts pr = div_floats(pa, pb, status);
1783 
1784     return float64_round_pack_canonical(pr, status);
1785 }
1786 
/* Single-precision division using the host's native float arithmetic. */
static float hard_f32_div(float a, float b)
{
    const float quotient = a / b;

    return quotient;
}
1791 
/* Double-precision division using the host's native double arithmetic. */
static double hard_f64_div(double a, double b)
{
    const double quotient = a / b;

    return quotient;
}
1796 
1797 static bool f32_div_pre(union_float32 a, union_float32 b)
1798 {
1799     if (QEMU_HARDFLOAT_2F32_USE_FP) {
1800         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
1801                fpclassify(b.h) == FP_NORMAL;
1802     }
1803     return float32_is_zero_or_normal(a.s) && float32_is_normal(b.s);
1804 }
1805 
1806 static bool f64_div_pre(union_float64 a, union_float64 b)
1807 {
1808     if (QEMU_HARDFLOAT_2F64_USE_FP) {
1809         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
1810                fpclassify(b.h) == FP_NORMAL;
1811     }
1812     return float64_is_zero_or_normal(a.s) && float64_is_normal(b.s);
1813 }
1814 
1815 static bool f32_div_post(union_float32 a, union_float32 b)
1816 {
1817     if (QEMU_HARDFLOAT_2F32_USE_FP) {
1818         return fpclassify(a.h) != FP_ZERO;
1819     }
1820     return !float32_is_zero(a.s);
1821 }
1822 
1823 static bool f64_div_post(union_float64 a, union_float64 b)
1824 {
1825     if (QEMU_HARDFLOAT_2F64_USE_FP) {
1826         return fpclassify(a.h) != FP_ZERO;
1827     }
1828     return !float64_is_zero(a.s);
1829 }
1830 
1831 float32 QEMU_FLATTEN
1832 float32_div(float32 a, float32 b, float_status *s)
1833 {
1834     return float32_gen2(a, b, s, hard_f32_div, soft_f32_div,
1835                         f32_div_pre, f32_div_post, NULL, NULL);
1836 }
1837 
1838 float64 QEMU_FLATTEN
1839 float64_div(float64 a, float64 b, float_status *s)
1840 {
1841     return float64_gen2(a, b, s, hard_f64_div, soft_f64_div,
1842                         f64_div_pre, f64_div_post, NULL, NULL);
1843 }
1844 
1845 /*
1846  * Float to Float conversions
1847  *
1848  * Returns the result of converting one float format to another. The
1849  * conversion is performed according to the IEC/IEEE Standard for
1850  * Binary Floating-Point Arithmetic.
1851  *
1852  * The float_to_float helper only needs to take care of raising
1853  * invalid exceptions and handling the conversion on NaNs.
1854  */
1855 
1856 static FloatParts float_to_float(FloatParts a, const FloatFmt *dstf,
1857                                  float_status *s)
1858 {
1859     if (dstf->arm_althp) {
1860         switch (a.cls) {
1861         case float_class_qnan:
1862         case float_class_snan:
1863             /* There is no NaN in the destination format.  Raise Invalid
1864              * and return a zero with the sign of the input NaN.
1865              */
1866             s->float_exception_flags |= float_flag_invalid;
1867             a.cls = float_class_zero;
1868             a.frac = 0;
1869             a.exp = 0;
1870             break;
1871 
1872         case float_class_inf:
1873             /* There is no Inf in the destination format.  Raise Invalid
1874              * and return the maximum normal with the correct sign.
1875              */
1876             s->float_exception_flags |= float_flag_invalid;
1877             a.cls = float_class_normal;
1878             a.exp = dstf->exp_max;
1879             a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift;
1880             break;
1881 
1882         default:
1883             break;
1884         }
1885     } else if (is_nan(a.cls)) {
1886         if (is_snan(a.cls)) {
1887             s->float_exception_flags |= float_flag_invalid;
1888             a = parts_silence_nan(a, s);
1889         }
1890         if (s->default_nan_mode) {
1891             return parts_default_nan(s);
1892         }
1893     }
1894     return a;
1895 }
1896 
1897 float32 float16_to_float32(float16 a, bool ieee, float_status *s)
1898 {
1899     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1900     FloatParts p = float16a_unpack_canonical(a, s, fmt16);
1901     FloatParts pr = float_to_float(p, &float32_params, s);
1902     return float32_round_pack_canonical(pr, s);
1903 }
1904 
1905 float64 float16_to_float64(float16 a, bool ieee, float_status *s)
1906 {
1907     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1908     FloatParts p = float16a_unpack_canonical(a, s, fmt16);
1909     FloatParts pr = float_to_float(p, &float64_params, s);
1910     return float64_round_pack_canonical(pr, s);
1911 }
1912 
1913 float16 float32_to_float16(float32 a, bool ieee, float_status *s)
1914 {
1915     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1916     FloatParts p = float32_unpack_canonical(a, s);
1917     FloatParts pr = float_to_float(p, fmt16, s);
1918     return float16a_round_pack_canonical(pr, s, fmt16);
1919 }
1920 
1921 float64 float32_to_float64(float32 a, float_status *s)
1922 {
1923     FloatParts p = float32_unpack_canonical(a, s);
1924     FloatParts pr = float_to_float(p, &float64_params, s);
1925     return float64_round_pack_canonical(pr, s);
1926 }
1927 
1928 float16 float64_to_float16(float64 a, bool ieee, float_status *s)
1929 {
1930     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1931     FloatParts p = float64_unpack_canonical(a, s);
1932     FloatParts pr = float_to_float(p, fmt16, s);
1933     return float16a_round_pack_canonical(pr, s, fmt16);
1934 }
1935 
1936 float32 float64_to_float32(float64 a, float_status *s)
1937 {
1938     FloatParts p = float64_unpack_canonical(a, s);
1939     FloatParts pr = float_to_float(p, &float32_params, s);
1940     return float32_round_pack_canonical(pr, s);
1941 }
1942 
1943 /*
1944  * Rounds the floating-point value `a' to an integer, and returns the
1945  * result as a floating-point value. The operation is performed
1946  * according to the IEC/IEEE Standard for Binary Floating-Point
1947  * Arithmetic.
1948  */
1949 
1950 static FloatParts round_to_int(FloatParts a, int rmode,
1951                                int scale, float_status *s)
1952 {
1953     switch (a.cls) {
1954     case float_class_qnan:
1955     case float_class_snan:
1956         return return_nan(a, s);
1957 
1958     case float_class_zero:
1959     case float_class_inf:
1960         /* already "integral" */
1961         break;
1962 
1963     case float_class_normal:
1964         scale = MIN(MAX(scale, -0x10000), 0x10000);
1965         a.exp += scale;
1966 
1967         if (a.exp >= DECOMPOSED_BINARY_POINT) {
1968             /* already integral */
1969             break;
1970         }
1971         if (a.exp < 0) {
1972             bool one;
1973             /* all fractional */
1974             s->float_exception_flags |= float_flag_inexact;
1975             switch (rmode) {
1976             case float_round_nearest_even:
1977                 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
1978                 break;
1979             case float_round_ties_away:
1980                 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
1981                 break;
1982             case float_round_to_zero:
1983                 one = false;
1984                 break;
1985             case float_round_up:
1986                 one = !a.sign;
1987                 break;
1988             case float_round_down:
1989                 one = a.sign;
1990                 break;
1991             default:
1992                 g_assert_not_reached();
1993             }
1994 
1995             if (one) {
1996                 a.frac = DECOMPOSED_IMPLICIT_BIT;
1997                 a.exp = 0;
1998             } else {
1999                 a.cls = float_class_zero;
2000             }
2001         } else {
2002             uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
2003             uint64_t frac_lsbm1 = frac_lsb >> 1;
2004             uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
2005             uint64_t rnd_mask = rnd_even_mask >> 1;
2006             uint64_t inc;
2007 
2008             switch (rmode) {
2009             case float_round_nearest_even:
2010                 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
2011                 break;
2012             case float_round_ties_away:
2013                 inc = frac_lsbm1;
2014                 break;
2015             case float_round_to_zero:
2016                 inc = 0;
2017                 break;
2018             case float_round_up:
2019                 inc = a.sign ? 0 : rnd_mask;
2020                 break;
2021             case float_round_down:
2022                 inc = a.sign ? rnd_mask : 0;
2023                 break;
2024             default:
2025                 g_assert_not_reached();
2026             }
2027 
2028             if (a.frac & rnd_mask) {
2029                 s->float_exception_flags |= float_flag_inexact;
2030                 a.frac += inc;
2031                 a.frac &= ~rnd_mask;
2032                 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
2033                     a.frac >>= 1;
2034                     a.exp++;
2035                 }
2036             }
2037         }
2038         break;
2039     default:
2040         g_assert_not_reached();
2041     }
2042     return a;
2043 }
2044 
2045 float16 float16_round_to_int(float16 a, float_status *s)
2046 {
2047     FloatParts pa = float16_unpack_canonical(a, s);
2048     FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2049     return float16_round_pack_canonical(pr, s);
2050 }
2051 
2052 float32 float32_round_to_int(float32 a, float_status *s)
2053 {
2054     FloatParts pa = float32_unpack_canonical(a, s);
2055     FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2056     return float32_round_pack_canonical(pr, s);
2057 }
2058 
2059 float64 float64_round_to_int(float64 a, float_status *s)
2060 {
2061     FloatParts pa = float64_unpack_canonical(a, s);
2062     FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2063     return float64_round_pack_canonical(pr, s);
2064 }
2065 
2066 /*
2067  * Returns the result of converting the floating-point value `a' to
2068  * the two's complement integer format. The conversion is performed
2069  * according to the IEC/IEEE Standard for Binary Floating-Point
2070  * Arithmetic---which means in particular that the conversion is
2071  * rounded according to the current rounding mode. If `a' is a NaN,
2072  * the largest positive integer is returned. Otherwise, if the
2073  * conversion overflows, the largest integer with the same sign as `a'
2074  * is returned.
2075 */
2076 
2077 static int64_t round_to_int_and_pack(FloatParts in, int rmode, int scale,
2078                                      int64_t min, int64_t max,
2079                                      float_status *s)
2080 {
2081     uint64_t r;
2082     int orig_flags = get_float_exception_flags(s);
2083     FloatParts p = round_to_int(in, rmode, scale, s);
2084 
2085     switch (p.cls) {
2086     case float_class_snan:
2087     case float_class_qnan:
2088         s->float_exception_flags = orig_flags | float_flag_invalid;
2089         return max;
2090     case float_class_inf:
2091         s->float_exception_flags = orig_flags | float_flag_invalid;
2092         return p.sign ? min : max;
2093     case float_class_zero:
2094         return 0;
2095     case float_class_normal:
2096         if (p.exp < DECOMPOSED_BINARY_POINT) {
2097             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
2098         } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
2099             r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
2100         } else {
2101             r = UINT64_MAX;
2102         }
2103         if (p.sign) {
2104             if (r <= -(uint64_t) min) {
2105                 return -r;
2106             } else {
2107                 s->float_exception_flags = orig_flags | float_flag_invalid;
2108                 return min;
2109             }
2110         } else {
2111             if (r <= max) {
2112                 return r;
2113             } else {
2114                 s->float_exception_flags = orig_flags | float_flag_invalid;
2115                 return max;
2116             }
2117         }
2118     default:
2119         g_assert_not_reached();
2120     }
2121 }
2122 
2123 int16_t float16_to_int16_scalbn(float16 a, int rmode, int scale,
2124                                 float_status *s)
2125 {
2126     return round_to_int_and_pack(float16_unpack_canonical(a, s),
2127                                  rmode, scale, INT16_MIN, INT16_MAX, s);
2128 }
2129 
2130 int32_t float16_to_int32_scalbn(float16 a, int rmode, int scale,
2131                                 float_status *s)
2132 {
2133     return round_to_int_and_pack(float16_unpack_canonical(a, s),
2134                                  rmode, scale, INT32_MIN, INT32_MAX, s);
2135 }
2136 
2137 int64_t float16_to_int64_scalbn(float16 a, int rmode, int scale,
2138                                 float_status *s)
2139 {
2140     return round_to_int_and_pack(float16_unpack_canonical(a, s),
2141                                  rmode, scale, INT64_MIN, INT64_MAX, s);
2142 }
2143 
2144 int16_t float32_to_int16_scalbn(float32 a, int rmode, int scale,
2145                                 float_status *s)
2146 {
2147     return round_to_int_and_pack(float32_unpack_canonical(a, s),
2148                                  rmode, scale, INT16_MIN, INT16_MAX, s);
2149 }
2150 
2151 int32_t float32_to_int32_scalbn(float32 a, int rmode, int scale,
2152                                 float_status *s)
2153 {
2154     return round_to_int_and_pack(float32_unpack_canonical(a, s),
2155                                  rmode, scale, INT32_MIN, INT32_MAX, s);
2156 }
2157 
2158 int64_t float32_to_int64_scalbn(float32 a, int rmode, int scale,
2159                                 float_status *s)
2160 {
2161     return round_to_int_and_pack(float32_unpack_canonical(a, s),
2162                                  rmode, scale, INT64_MIN, INT64_MAX, s);
2163 }
2164 
2165 int16_t float64_to_int16_scalbn(float64 a, int rmode, int scale,
2166                                 float_status *s)
2167 {
2168     return round_to_int_and_pack(float64_unpack_canonical(a, s),
2169                                  rmode, scale, INT16_MIN, INT16_MAX, s);
2170 }
2171 
2172 int32_t float64_to_int32_scalbn(float64 a, int rmode, int scale,
2173                                 float_status *s)
2174 {
2175     return round_to_int_and_pack(float64_unpack_canonical(a, s),
2176                                  rmode, scale, INT32_MIN, INT32_MAX, s);
2177 }
2178 
2179 int64_t float64_to_int64_scalbn(float64 a, int rmode, int scale,
2180                                 float_status *s)
2181 {
2182     return round_to_int_and_pack(float64_unpack_canonical(a, s),
2183                                  rmode, scale, INT64_MIN, INT64_MAX, s);
2184 }
2185 
2186 int16_t float16_to_int16(float16 a, float_status *s)
2187 {
2188     return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2189 }
2190 
2191 int32_t float16_to_int32(float16 a, float_status *s)
2192 {
2193     return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2194 }
2195 
2196 int64_t float16_to_int64(float16 a, float_status *s)
2197 {
2198     return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2199 }
2200 
2201 int16_t float32_to_int16(float32 a, float_status *s)
2202 {
2203     return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2204 }
2205 
2206 int32_t float32_to_int32(float32 a, float_status *s)
2207 {
2208     return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2209 }
2210 
2211 int64_t float32_to_int64(float32 a, float_status *s)
2212 {
2213     return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2214 }
2215 
2216 int16_t float64_to_int16(float64 a, float_status *s)
2217 {
2218     return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2219 }
2220 
2221 int32_t float64_to_int32(float64 a, float_status *s)
2222 {
2223     return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2224 }
2225 
2226 int64_t float64_to_int64(float64 a, float_status *s)
2227 {
2228     return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2229 }
2230 
2231 int16_t float16_to_int16_round_to_zero(float16 a, float_status *s)
2232 {
2233     return float16_to_int16_scalbn(a, float_round_to_zero, 0, s);
2234 }
2235 
2236 int32_t float16_to_int32_round_to_zero(float16 a, float_status *s)
2237 {
2238     return float16_to_int32_scalbn(a, float_round_to_zero, 0, s);
2239 }
2240 
2241 int64_t float16_to_int64_round_to_zero(float16 a, float_status *s)
2242 {
2243     return float16_to_int64_scalbn(a, float_round_to_zero, 0, s);
2244 }
2245 
2246 int16_t float32_to_int16_round_to_zero(float32 a, float_status *s)
2247 {
2248     return float32_to_int16_scalbn(a, float_round_to_zero, 0, s);
2249 }
2250 
2251 int32_t float32_to_int32_round_to_zero(float32 a, float_status *s)
2252 {
2253     return float32_to_int32_scalbn(a, float_round_to_zero, 0, s);
2254 }
2255 
2256 int64_t float32_to_int64_round_to_zero(float32 a, float_status *s)
2257 {
2258     return float32_to_int64_scalbn(a, float_round_to_zero, 0, s);
2259 }
2260 
2261 int16_t float64_to_int16_round_to_zero(float64 a, float_status *s)
2262 {
2263     return float64_to_int16_scalbn(a, float_round_to_zero, 0, s);
2264 }
2265 
2266 int32_t float64_to_int32_round_to_zero(float64 a, float_status *s)
2267 {
2268     return float64_to_int32_scalbn(a, float_round_to_zero, 0, s);
2269 }
2270 
2271 int64_t float64_to_int64_round_to_zero(float64 a, float_status *s)
2272 {
2273     return float64_to_int64_scalbn(a, float_round_to_zero, 0, s);
2274 }
2275 
2276 /*
2277  *  Returns the result of converting the floating-point value `a' to
2278  *  the unsigned integer format. The conversion is performed according
2279  *  to the IEC/IEEE Standard for Binary Floating-Point
2280  *  Arithmetic---which means in particular that the conversion is
2281  *  rounded according to the current rounding mode. If `a' is a NaN,
2282  *  the largest unsigned integer is returned. Otherwise, if the
 *  conversion overflows, the largest unsigned integer is returned. If
 *  `a' is negative, the value is rounded and zero is returned;
 *  negative values that do not round to zero raise the invalid
 *  exception flag.
2287  */
2288 
2289 static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, int scale,
2290                                        uint64_t max, float_status *s)
2291 {
2292     int orig_flags = get_float_exception_flags(s);
2293     FloatParts p = round_to_int(in, rmode, scale, s);
2294     uint64_t r;
2295 
2296     switch (p.cls) {
2297     case float_class_snan:
2298     case float_class_qnan:
2299         s->float_exception_flags = orig_flags | float_flag_invalid;
2300         return max;
2301     case float_class_inf:
2302         s->float_exception_flags = orig_flags | float_flag_invalid;
2303         return p.sign ? 0 : max;
2304     case float_class_zero:
2305         return 0;
2306     case float_class_normal:
2307         if (p.sign) {
2308             s->float_exception_flags = orig_flags | float_flag_invalid;
2309             return 0;
2310         }
2311 
2312         if (p.exp < DECOMPOSED_BINARY_POINT) {
2313             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
2314         } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
2315             r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
2316         } else {
2317             s->float_exception_flags = orig_flags | float_flag_invalid;
2318             return max;
2319         }
2320 
2321         /* For uint64 this will never trip, but if p.exp is too large
2322          * to shift a decomposed fraction we shall have exited via the
2323          * 3rd leg above.
2324          */
2325         if (r > max) {
2326             s->float_exception_flags = orig_flags | float_flag_invalid;
2327             return max;
2328         }
2329         return r;
2330     default:
2331         g_assert_not_reached();
2332     }
2333 }
2334 
2335 uint16_t float16_to_uint16_scalbn(float16 a, int rmode, int scale,
2336                                   float_status *s)
2337 {
2338     return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2339                                   rmode, scale, UINT16_MAX, s);
2340 }
2341 
2342 uint32_t float16_to_uint32_scalbn(float16 a, int rmode, int scale,
2343                                   float_status *s)
2344 {
2345     return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2346                                   rmode, scale, UINT32_MAX, s);
2347 }
2348 
2349 uint64_t float16_to_uint64_scalbn(float16 a, int rmode, int scale,
2350                                   float_status *s)
2351 {
2352     return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2353                                   rmode, scale, UINT64_MAX, s);
2354 }
2355 
2356 uint16_t float32_to_uint16_scalbn(float32 a, int rmode, int scale,
2357                                   float_status *s)
2358 {
2359     return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2360                                   rmode, scale, UINT16_MAX, s);
2361 }
2362 
2363 uint32_t float32_to_uint32_scalbn(float32 a, int rmode, int scale,
2364                                   float_status *s)
2365 {
2366     return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2367                                   rmode, scale, UINT32_MAX, s);
2368 }
2369 
2370 uint64_t float32_to_uint64_scalbn(float32 a, int rmode, int scale,
2371                                   float_status *s)
2372 {
2373     return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2374                                   rmode, scale, UINT64_MAX, s);
2375 }
2376 
2377 uint16_t float64_to_uint16_scalbn(float64 a, int rmode, int scale,
2378                                   float_status *s)
2379 {
2380     return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2381                                   rmode, scale, UINT16_MAX, s);
2382 }
2383 
2384 uint32_t float64_to_uint32_scalbn(float64 a, int rmode, int scale,
2385                                   float_status *s)
2386 {
2387     return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2388                                   rmode, scale, UINT32_MAX, s);
2389 }
2390 
2391 uint64_t float64_to_uint64_scalbn(float64 a, int rmode, int scale,
2392                                   float_status *s)
2393 {
2394     return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2395                                   rmode, scale, UINT64_MAX, s);
2396 }
2397 
2398 uint16_t float16_to_uint16(float16 a, float_status *s)
2399 {
2400     return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2401 }
2402 
2403 uint32_t float16_to_uint32(float16 a, float_status *s)
2404 {
2405     return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2406 }
2407 
2408 uint64_t float16_to_uint64(float16 a, float_status *s)
2409 {
2410     return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2411 }
2412 
2413 uint16_t float32_to_uint16(float32 a, float_status *s)
2414 {
2415     return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2416 }
2417 
2418 uint32_t float32_to_uint32(float32 a, float_status *s)
2419 {
2420     return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2421 }
2422 
2423 uint64_t float32_to_uint64(float32 a, float_status *s)
2424 {
2425     return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2426 }
2427 
2428 uint16_t float64_to_uint16(float64 a, float_status *s)
2429 {
2430     return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2431 }
2432 
2433 uint32_t float64_to_uint32(float64 a, float_status *s)
2434 {
2435     return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2436 }
2437 
2438 uint64_t float64_to_uint64(float64 a, float_status *s)
2439 {
2440     return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2441 }
2442 
2443 uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s)
2444 {
2445     return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2446 }
2447 
2448 uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s)
2449 {
2450     return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2451 }
2452 
2453 uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s)
2454 {
2455     return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2456 }
2457 
2458 uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *s)
2459 {
2460     return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2461 }
2462 
2463 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s)
2464 {
2465     return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2466 }
2467 
2468 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s)
2469 {
2470     return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2471 }
2472 
2473 uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s)
2474 {
2475     return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2476 }
2477 
2478 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s)
2479 {
2480     return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2481 }
2482 
2483 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s)
2484 {
2485     return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2486 }
2487 
2488 /*
2489  * Integer to float conversions
2490  *
2491  * Returns the result of converting the two's complement integer `a'
2492  * to the floating-point format. The conversion is performed according
2493  * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2494  */
2495 
2496 static FloatParts int_to_float(int64_t a, int scale, float_status *status)
2497 {
2498     FloatParts r = { .sign = false };
2499 
2500     if (a == 0) {
2501         r.cls = float_class_zero;
2502     } else {
2503         uint64_t f = a;
2504         int shift;
2505 
2506         r.cls = float_class_normal;
2507         if (a < 0) {
2508             f = -f;
2509             r.sign = true;
2510         }
2511         shift = clz64(f) - 1;
2512         scale = MIN(MAX(scale, -0x10000), 0x10000);
2513 
2514         r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
2515         r.frac = (shift < 0 ? DECOMPOSED_IMPLICIT_BIT : f << shift);
2516     }
2517 
2518     return r;
2519 }
2520 
2521 float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status)
2522 {
2523     FloatParts pa = int_to_float(a, scale, status);
2524     return float16_round_pack_canonical(pa, status);
2525 }
2526 
2527 float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status)
2528 {
2529     return int64_to_float16_scalbn(a, scale, status);
2530 }
2531 
2532 float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status)
2533 {
2534     return int64_to_float16_scalbn(a, scale, status);
2535 }
2536 
2537 float16 int64_to_float16(int64_t a, float_status *status)
2538 {
2539     return int64_to_float16_scalbn(a, 0, status);
2540 }
2541 
2542 float16 int32_to_float16(int32_t a, float_status *status)
2543 {
2544     return int64_to_float16_scalbn(a, 0, status);
2545 }
2546 
2547 float16 int16_to_float16(int16_t a, float_status *status)
2548 {
2549     return int64_to_float16_scalbn(a, 0, status);
2550 }
2551 
2552 float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status)
2553 {
2554     FloatParts pa = int_to_float(a, scale, status);
2555     return float32_round_pack_canonical(pa, status);
2556 }
2557 
2558 float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status)
2559 {
2560     return int64_to_float32_scalbn(a, scale, status);
2561 }
2562 
2563 float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status)
2564 {
2565     return int64_to_float32_scalbn(a, scale, status);
2566 }
2567 
2568 float32 int64_to_float32(int64_t a, float_status *status)
2569 {
2570     return int64_to_float32_scalbn(a, 0, status);
2571 }
2572 
2573 float32 int32_to_float32(int32_t a, float_status *status)
2574 {
2575     return int64_to_float32_scalbn(a, 0, status);
2576 }
2577 
2578 float32 int16_to_float32(int16_t a, float_status *status)
2579 {
2580     return int64_to_float32_scalbn(a, 0, status);
2581 }
2582 
2583 float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status)
2584 {
2585     FloatParts pa = int_to_float(a, scale, status);
2586     return float64_round_pack_canonical(pa, status);
2587 }
2588 
2589 float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status)
2590 {
2591     return int64_to_float64_scalbn(a, scale, status);
2592 }
2593 
2594 float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status)
2595 {
2596     return int64_to_float64_scalbn(a, scale, status);
2597 }
2598 
2599 float64 int64_to_float64(int64_t a, float_status *status)
2600 {
2601     return int64_to_float64_scalbn(a, 0, status);
2602 }
2603 
2604 float64 int32_to_float64(int32_t a, float_status *status)
2605 {
2606     return int64_to_float64_scalbn(a, 0, status);
2607 }
2608 
2609 float64 int16_to_float64(int16_t a, float_status *status)
2610 {
2611     return int64_to_float64_scalbn(a, 0, status);
2612 }
2613 
2614 
2615 /*
2616  * Unsigned Integer to float conversions
2617  *
2618  * Returns the result of converting the unsigned integer `a' to the
2619  * floating-point format. The conversion is performed according to the
2620  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2621  */
2622 
2623 static FloatParts uint_to_float(uint64_t a, int scale, float_status *status)
2624 {
2625     FloatParts r = { .sign = false };
2626 
2627     if (a == 0) {
2628         r.cls = float_class_zero;
2629     } else {
2630         scale = MIN(MAX(scale, -0x10000), 0x10000);
2631         r.cls = float_class_normal;
2632         if ((int64_t)a < 0) {
2633             r.exp = DECOMPOSED_BINARY_POINT + 1 + scale;
2634             shift64RightJamming(a, 1, &a);
2635             r.frac = a;
2636         } else {
2637             int shift = clz64(a) - 1;
2638             r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
2639             r.frac = a << shift;
2640         }
2641     }
2642 
2643     return r;
2644 }
2645 
2646 float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status)
2647 {
2648     FloatParts pa = uint_to_float(a, scale, status);
2649     return float16_round_pack_canonical(pa, status);
2650 }
2651 
2652 float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status)
2653 {
2654     return uint64_to_float16_scalbn(a, scale, status);
2655 }
2656 
2657 float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status)
2658 {
2659     return uint64_to_float16_scalbn(a, scale, status);
2660 }
2661 
2662 float16 uint64_to_float16(uint64_t a, float_status *status)
2663 {
2664     return uint64_to_float16_scalbn(a, 0, status);
2665 }
2666 
2667 float16 uint32_to_float16(uint32_t a, float_status *status)
2668 {
2669     return uint64_to_float16_scalbn(a, 0, status);
2670 }
2671 
2672 float16 uint16_to_float16(uint16_t a, float_status *status)
2673 {
2674     return uint64_to_float16_scalbn(a, 0, status);
2675 }
2676 
2677 float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status)
2678 {
2679     FloatParts pa = uint_to_float(a, scale, status);
2680     return float32_round_pack_canonical(pa, status);
2681 }
2682 
2683 float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status)
2684 {
2685     return uint64_to_float32_scalbn(a, scale, status);
2686 }
2687 
2688 float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status)
2689 {
2690     return uint64_to_float32_scalbn(a, scale, status);
2691 }
2692 
2693 float32 uint64_to_float32(uint64_t a, float_status *status)
2694 {
2695     return uint64_to_float32_scalbn(a, 0, status);
2696 }
2697 
2698 float32 uint32_to_float32(uint32_t a, float_status *status)
2699 {
2700     return uint64_to_float32_scalbn(a, 0, status);
2701 }
2702 
2703 float32 uint16_to_float32(uint16_t a, float_status *status)
2704 {
2705     return uint64_to_float32_scalbn(a, 0, status);
2706 }
2707 
2708 float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status)
2709 {
2710     FloatParts pa = uint_to_float(a, scale, status);
2711     return float64_round_pack_canonical(pa, status);
2712 }
2713 
2714 float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status)
2715 {
2716     return uint64_to_float64_scalbn(a, scale, status);
2717 }
2718 
2719 float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status)
2720 {
2721     return uint64_to_float64_scalbn(a, scale, status);
2722 }
2723 
2724 float64 uint64_to_float64(uint64_t a, float_status *status)
2725 {
2726     return uint64_to_float64_scalbn(a, 0, status);
2727 }
2728 
2729 float64 uint32_to_float64(uint32_t a, float_status *status)
2730 {
2731     return uint64_to_float64_scalbn(a, 0, status);
2732 }
2733 
2734 float64 uint16_to_float64(uint16_t a, float_status *status)
2735 {
2736     return uint64_to_float64_scalbn(a, 0, status);
2737 }
2738 
2739 /* Float Min/Max */
2740 /* min() and max() functions. These can't be implemented as
2741  * 'compare and pick one input' because that would mishandle
2742  * NaNs and +0 vs -0.
2743  *
2744  * minnum() and maxnum() functions. These are similar to the min()
2745  * and max() functions but if one of the arguments is a QNaN and
2746  * the other is numerical then the numerical argument is returned.
2747  * SNaNs will get quietened before being returned.
 * minnum() and maxnum() correspond to the IEEE 754-2008 minNum()
2749  * and maxNum() operations. min() and max() are the typical min/max
2750  * semantics provided by many CPUs which predate that specification.
2751  *
 * minnummag() and maxnummag() functions correspond to minNumMag()
 * and maxNumMag() from IEEE 754-2008.
2754  */
2755 static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin,
2756                                 bool ieee, bool ismag, float_status *s)
2757 {
2758     if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
2759         if (ieee) {
2760             /* Takes two floating-point values `a' and `b', one of
2761              * which is a NaN, and returns the appropriate NaN
2762              * result. If either `a' or `b' is a signaling NaN,
2763              * the invalid exception is raised.
2764              */
2765             if (is_snan(a.cls) || is_snan(b.cls)) {
2766                 return pick_nan(a, b, s);
2767             } else if (is_nan(a.cls) && !is_nan(b.cls)) {
2768                 return b;
2769             } else if (is_nan(b.cls) && !is_nan(a.cls)) {
2770                 return a;
2771             }
2772         }
2773         return pick_nan(a, b, s);
2774     } else {
2775         int a_exp, b_exp;
2776 
2777         switch (a.cls) {
2778         case float_class_normal:
2779             a_exp = a.exp;
2780             break;
2781         case float_class_inf:
2782             a_exp = INT_MAX;
2783             break;
2784         case float_class_zero:
2785             a_exp = INT_MIN;
2786             break;
2787         default:
2788             g_assert_not_reached();
2789             break;
2790         }
2791         switch (b.cls) {
2792         case float_class_normal:
2793             b_exp = b.exp;
2794             break;
2795         case float_class_inf:
2796             b_exp = INT_MAX;
2797             break;
2798         case float_class_zero:
2799             b_exp = INT_MIN;
2800             break;
2801         default:
2802             g_assert_not_reached();
2803             break;
2804         }
2805 
2806         if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
2807             bool a_less = a_exp < b_exp;
2808             if (a_exp == b_exp) {
2809                 a_less = a.frac < b.frac;
2810             }
2811             return a_less ^ ismin ? b : a;
2812         }
2813 
2814         if (a.sign == b.sign) {
2815             bool a_less = a_exp < b_exp;
2816             if (a_exp == b_exp) {
2817                 a_less = a.frac < b.frac;
2818             }
2819             return a.sign ^ a_less ^ ismin ? b : a;
2820         } else {
2821             return a.sign ^ ismin ? b : a;
2822         }
2823     }
2824 }
2825 
2826 #define MINMAX(sz, name, ismin, isiee, ismag)                           \
2827 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b,      \
2828                                      float_status *s)                   \
2829 {                                                                       \
2830     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
2831     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
2832     FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);      \
2833                                                                         \
2834     return float ## sz ## _round_pack_canonical(pr, s);                 \
2835 }
2836 
2837 MINMAX(16, min, true, false, false)
2838 MINMAX(16, minnum, true, true, false)
2839 MINMAX(16, minnummag, true, true, true)
2840 MINMAX(16, max, false, false, false)
2841 MINMAX(16, maxnum, false, true, false)
2842 MINMAX(16, maxnummag, false, true, true)
2843 
2844 MINMAX(32, min, true, false, false)
2845 MINMAX(32, minnum, true, true, false)
2846 MINMAX(32, minnummag, true, true, true)
2847 MINMAX(32, max, false, false, false)
2848 MINMAX(32, maxnum, false, true, false)
2849 MINMAX(32, maxnummag, false, true, true)
2850 
2851 MINMAX(64, min, true, false, false)
2852 MINMAX(64, minnum, true, true, false)
2853 MINMAX(64, minnummag, true, true, true)
2854 MINMAX(64, max, false, false, false)
2855 MINMAX(64, maxnum, false, true, false)
2856 MINMAX(64, maxnummag, false, true, true)
2857 
2858 #undef MINMAX
2859 
2860 /* Floating point compare */
2861 static int compare_floats(FloatParts a, FloatParts b, bool is_quiet,
2862                           float_status *s)
2863 {
2864     if (is_nan(a.cls) || is_nan(b.cls)) {
2865         if (!is_quiet ||
2866             a.cls == float_class_snan ||
2867             b.cls == float_class_snan) {
2868             s->float_exception_flags |= float_flag_invalid;
2869         }
2870         return float_relation_unordered;
2871     }
2872 
2873     if (a.cls == float_class_zero) {
2874         if (b.cls == float_class_zero) {
2875             return float_relation_equal;
2876         }
2877         return b.sign ? float_relation_greater : float_relation_less;
2878     } else if (b.cls == float_class_zero) {
2879         return a.sign ? float_relation_less : float_relation_greater;
2880     }
2881 
2882     /* The only really important thing about infinity is its sign. If
2883      * both are infinities the sign marks the smallest of the two.
2884      */
2885     if (a.cls == float_class_inf) {
2886         if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
2887             return float_relation_equal;
2888         }
2889         return a.sign ? float_relation_less : float_relation_greater;
2890     } else if (b.cls == float_class_inf) {
2891         return b.sign ? float_relation_greater : float_relation_less;
2892     }
2893 
2894     if (a.sign != b.sign) {
2895         return a.sign ? float_relation_less : float_relation_greater;
2896     }
2897 
2898     if (a.exp == b.exp) {
2899         if (a.frac == b.frac) {
2900             return float_relation_equal;
2901         }
2902         if (a.sign) {
2903             return a.frac > b.frac ?
2904                 float_relation_less : float_relation_greater;
2905         } else {
2906             return a.frac > b.frac ?
2907                 float_relation_greater : float_relation_less;
2908         }
2909     } else {
2910         if (a.sign) {
2911             return a.exp > b.exp ? float_relation_less : float_relation_greater;
2912         } else {
2913             return a.exp > b.exp ? float_relation_greater : float_relation_less;
2914         }
2915     }
2916 }
2917 
2918 #define COMPARE(name, attr, sz)                                         \
2919 static int attr                                                         \
2920 name(float ## sz a, float ## sz b, bool is_quiet, float_status *s)      \
2921 {                                                                       \
2922     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
2923     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
2924     return compare_floats(pa, pb, is_quiet, s);                         \
2925 }
2926 
2927 COMPARE(soft_f16_compare, QEMU_FLATTEN, 16)
2928 COMPARE(soft_f32_compare, QEMU_SOFTFLOAT_ATTR, 32)
2929 COMPARE(soft_f64_compare, QEMU_SOFTFLOAT_ATTR, 64)
2930 
2931 #undef COMPARE
2932 
2933 int float16_compare(float16 a, float16 b, float_status *s)
2934 {
2935     return soft_f16_compare(a, b, false, s);
2936 }
2937 
2938 int float16_compare_quiet(float16 a, float16 b, float_status *s)
2939 {
2940     return soft_f16_compare(a, b, true, s);
2941 }
2942 
2943 static int QEMU_FLATTEN
2944 f32_compare(float32 xa, float32 xb, bool is_quiet, float_status *s)
2945 {
2946     union_float32 ua, ub;
2947 
2948     ua.s = xa;
2949     ub.s = xb;
2950 
2951     if (QEMU_NO_HARDFLOAT) {
2952         goto soft;
2953     }
2954 
2955     float32_input_flush2(&ua.s, &ub.s, s);
2956     if (isgreaterequal(ua.h, ub.h)) {
2957         if (isgreater(ua.h, ub.h)) {
2958             return float_relation_greater;
2959         }
2960         return float_relation_equal;
2961     }
2962     if (likely(isless(ua.h, ub.h))) {
2963         return float_relation_less;
2964     }
2965     /* The only condition remaining is unordered.
2966      * Fall through to set flags.
2967      */
2968  soft:
2969     return soft_f32_compare(ua.s, ub.s, is_quiet, s);
2970 }
2971 
2972 int float32_compare(float32 a, float32 b, float_status *s)
2973 {
2974     return f32_compare(a, b, false, s);
2975 }
2976 
2977 int float32_compare_quiet(float32 a, float32 b, float_status *s)
2978 {
2979     return f32_compare(a, b, true, s);
2980 }
2981 
2982 static int QEMU_FLATTEN
2983 f64_compare(float64 xa, float64 xb, bool is_quiet, float_status *s)
2984 {
2985     union_float64 ua, ub;
2986 
2987     ua.s = xa;
2988     ub.s = xb;
2989 
2990     if (QEMU_NO_HARDFLOAT) {
2991         goto soft;
2992     }
2993 
2994     float64_input_flush2(&ua.s, &ub.s, s);
2995     if (isgreaterequal(ua.h, ub.h)) {
2996         if (isgreater(ua.h, ub.h)) {
2997             return float_relation_greater;
2998         }
2999         return float_relation_equal;
3000     }
3001     if (likely(isless(ua.h, ub.h))) {
3002         return float_relation_less;
3003     }
3004     /* The only condition remaining is unordered.
3005      * Fall through to set flags.
3006      */
3007  soft:
3008     return soft_f64_compare(ua.s, ub.s, is_quiet, s);
3009 }
3010 
3011 int float64_compare(float64 a, float64 b, float_status *s)
3012 {
3013     return f64_compare(a, b, false, s);
3014 }
3015 
3016 int float64_compare_quiet(float64 a, float64 b, float_status *s)
3017 {
3018     return f64_compare(a, b, true, s);
3019 }
3020 
3021 /* Multiply A by 2 raised to the power N.  */
3022 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s)
3023 {
3024     if (unlikely(is_nan(a.cls))) {
3025         return return_nan(a, s);
3026     }
3027     if (a.cls == float_class_normal) {
3028         /* The largest float type (even though not supported by FloatParts)
3029          * is float128, which has a 15 bit exponent.  Bounding N to 16 bits
3030          * still allows rounding to infinity, without allowing overflow
3031          * within the int32_t that backs FloatParts.exp.
3032          */
3033         n = MIN(MAX(n, -0x10000), 0x10000);
3034         a.exp += n;
3035     }
3036     return a;
3037 }
3038 
3039 float16 float16_scalbn(float16 a, int n, float_status *status)
3040 {
3041     FloatParts pa = float16_unpack_canonical(a, status);
3042     FloatParts pr = scalbn_decomposed(pa, n, status);
3043     return float16_round_pack_canonical(pr, status);
3044 }
3045 
3046 float32 float32_scalbn(float32 a, int n, float_status *status)
3047 {
3048     FloatParts pa = float32_unpack_canonical(a, status);
3049     FloatParts pr = scalbn_decomposed(pa, n, status);
3050     return float32_round_pack_canonical(pr, status);
3051 }
3052 
3053 float64 float64_scalbn(float64 a, int n, float_status *status)
3054 {
3055     FloatParts pa = float64_unpack_canonical(a, status);
3056     FloatParts pr = scalbn_decomposed(pa, n, status);
3057     return float64_round_pack_canonical(pr, status);
3058 }
3059 
3060 /*
3061  * Square Root
3062  *
3063  * The old softfloat code did an approximation step before zeroing in
 * on the final result. However, for simplicity we just compute the
3065  * square root by iterating down from the implicit bit to enough extra
3066  * bits to ensure we get a correctly rounded result.
3067  *
3068  * This does mean however the calculation is slower than before,
3069  * especially for 64 bit floats.
3070  */
3071 
3072 static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p)
3073 {
3074     uint64_t a_frac, r_frac, s_frac;
3075     int bit, last_bit;
3076 
3077     if (is_nan(a.cls)) {
3078         return return_nan(a, s);
3079     }
3080     if (a.cls == float_class_zero) {
3081         return a;  /* sqrt(+-0) = +-0 */
3082     }
3083     if (a.sign) {
3084         s->float_exception_flags |= float_flag_invalid;
3085         return parts_default_nan(s);
3086     }
3087     if (a.cls == float_class_inf) {
3088         return a;  /* sqrt(+inf) = +inf */
3089     }
3090 
3091     assert(a.cls == float_class_normal);
3092 
3093     /* We need two overflow bits at the top. Adding room for that is a
3094      * right shift. If the exponent is odd, we can discard the low bit
3095      * by multiplying the fraction by 2; that's a left shift. Combine
3096      * those and we shift right if the exponent is even.
3097      */
3098     a_frac = a.frac;
3099     if (!(a.exp & 1)) {
3100         a_frac >>= 1;
3101     }
3102     a.exp >>= 1;
3103 
3104     /* Bit-by-bit computation of sqrt.  */
3105     r_frac = 0;
3106     s_frac = 0;
3107 
3108     /* Iterate from implicit bit down to the 3 extra bits to compute a
3109      * properly rounded result. Remember we've inserted one more bit
3110      * at the top, so these positions are one less.
3111      */
3112     bit = DECOMPOSED_BINARY_POINT - 1;
3113     last_bit = MAX(p->frac_shift - 4, 0);
3114     do {
3115         uint64_t q = 1ULL << bit;
3116         uint64_t t_frac = s_frac + q;
3117         if (t_frac <= a_frac) {
3118             s_frac = t_frac + q;
3119             a_frac -= t_frac;
3120             r_frac += q;
3121         }
3122         a_frac <<= 1;
3123     } while (--bit >= last_bit);
3124 
3125     /* Undo the right shift done above. If there is any remaining
3126      * fraction, the result is inexact. Set the sticky bit.
3127      */
3128     a.frac = (r_frac << 1) + (a_frac != 0);
3129 
3130     return a;
3131 }
3132 
3133 float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status)
3134 {
3135     FloatParts pa = float16_unpack_canonical(a, status);
3136     FloatParts pr = sqrt_float(pa, status, &float16_params);
3137     return float16_round_pack_canonical(pr, status);
3138 }
3139 
3140 static float32 QEMU_SOFTFLOAT_ATTR
3141 soft_f32_sqrt(float32 a, float_status *status)
3142 {
3143     FloatParts pa = float32_unpack_canonical(a, status);
3144     FloatParts pr = sqrt_float(pa, status, &float32_params);
3145     return float32_round_pack_canonical(pr, status);
3146 }
3147 
3148 static float64 QEMU_SOFTFLOAT_ATTR
3149 soft_f64_sqrt(float64 a, float_status *status)
3150 {
3151     FloatParts pa = float64_unpack_canonical(a, status);
3152     FloatParts pr = sqrt_float(pa, status, &float64_params);
3153     return float64_round_pack_canonical(pr, status);
3154 }
3155 
3156 float32 QEMU_FLATTEN float32_sqrt(float32 xa, float_status *s)
3157 {
3158     union_float32 ua, ur;
3159 
3160     ua.s = xa;
3161     if (unlikely(!can_use_fpu(s))) {
3162         goto soft;
3163     }
3164 
3165     float32_input_flush1(&ua.s, s);
3166     if (QEMU_HARDFLOAT_1F32_USE_FP) {
3167         if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
3168                        fpclassify(ua.h) == FP_ZERO) ||
3169                      signbit(ua.h))) {
3170             goto soft;
3171         }
3172     } else if (unlikely(!float32_is_zero_or_normal(ua.s) ||
3173                         float32_is_neg(ua.s))) {
3174         goto soft;
3175     }
3176     ur.h = sqrtf(ua.h);
3177     return ur.s;
3178 
3179  soft:
3180     return soft_f32_sqrt(ua.s, s);
3181 }
3182 
3183 float64 QEMU_FLATTEN float64_sqrt(float64 xa, float_status *s)
3184 {
3185     union_float64 ua, ur;
3186 
3187     ua.s = xa;
3188     if (unlikely(!can_use_fpu(s))) {
3189         goto soft;
3190     }
3191 
3192     float64_input_flush1(&ua.s, s);
3193     if (QEMU_HARDFLOAT_1F64_USE_FP) {
3194         if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
3195                        fpclassify(ua.h) == FP_ZERO) ||
3196                      signbit(ua.h))) {
3197             goto soft;
3198         }
3199     } else if (unlikely(!float64_is_zero_or_normal(ua.s) ||
3200                         float64_is_neg(ua.s))) {
3201         goto soft;
3202     }
3203     ur.h = sqrt(ua.h);
3204     return ur.s;
3205 
3206  soft:
3207     return soft_f64_sqrt(ua.s, s);
3208 }
3209 
3210 /*----------------------------------------------------------------------------
3211 | The pattern for a default generated NaN.
3212 *----------------------------------------------------------------------------*/
3213 
3214 float16 float16_default_nan(float_status *status)
3215 {
3216     FloatParts p = parts_default_nan(status);
3217     p.frac >>= float16_params.frac_shift;
3218     return float16_pack_raw(p);
3219 }
3220 
3221 float32 float32_default_nan(float_status *status)
3222 {
3223     FloatParts p = parts_default_nan(status);
3224     p.frac >>= float32_params.frac_shift;
3225     return float32_pack_raw(p);
3226 }
3227 
3228 float64 float64_default_nan(float_status *status)
3229 {
3230     FloatParts p = parts_default_nan(status);
3231     p.frac >>= float64_params.frac_shift;
3232     return float64_pack_raw(p);
3233 }
3234 
3235 float128 float128_default_nan(float_status *status)
3236 {
3237     FloatParts p = parts_default_nan(status);
3238     float128 r;
3239 
3240     /* Extrapolate from the choices made by parts_default_nan to fill
3241      * in the quad-floating format.  If the low bit is set, assume we
3242      * want to set all non-snan bits.
3243      */
3244     r.low = -(p.frac & 1);
3245     r.high = p.frac >> (DECOMPOSED_BINARY_POINT - 48);
3246     r.high |= LIT64(0x7FFF000000000000);
3247     r.high |= (uint64_t)p.sign << 63;
3248 
3249     return r;
3250 }
3251 
3252 /*----------------------------------------------------------------------------
3253 | Returns a quiet NaN from a signalling NaN for the floating point value `a'.
3254 *----------------------------------------------------------------------------*/
3255 
3256 float16 float16_silence_nan(float16 a, float_status *status)
3257 {
3258     FloatParts p = float16_unpack_raw(a);
3259     p.frac <<= float16_params.frac_shift;
3260     p = parts_silence_nan(p, status);
3261     p.frac >>= float16_params.frac_shift;
3262     return float16_pack_raw(p);
3263 }
3264 
3265 float32 float32_silence_nan(float32 a, float_status *status)
3266 {
3267     FloatParts p = float32_unpack_raw(a);
3268     p.frac <<= float32_params.frac_shift;
3269     p = parts_silence_nan(p, status);
3270     p.frac >>= float32_params.frac_shift;
3271     return float32_pack_raw(p);
3272 }
3273 
3274 float64 float64_silence_nan(float64 a, float_status *status)
3275 {
3276     FloatParts p = float64_unpack_raw(a);
3277     p.frac <<= float64_params.frac_shift;
3278     p = parts_silence_nan(p, status);
3279     p.frac >>= float64_params.frac_shift;
3280     return float64_pack_raw(p);
3281 }
3282 
3283 /*----------------------------------------------------------------------------
3284 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
3285 | and 7, and returns the properly rounded 32-bit integer corresponding to the
3286 | input.  If `zSign' is 1, the input is negated before being converted to an
3287 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
3288 | is simply rounded to an integer, with the inexact exception raised if the
3289 | input cannot be represented exactly as an integer.  However, if the fixed-
3290 | point input is too large, the invalid exception is raised and the largest
3291 | positive or negative integer is returned.
3292 *----------------------------------------------------------------------------*/
3293 
3294 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
3295 {
3296     int8_t roundingMode;
3297     flag roundNearestEven;
3298     int8_t roundIncrement, roundBits;
3299     int32_t z;
3300 
3301     roundingMode = status->float_rounding_mode;
3302     roundNearestEven = ( roundingMode == float_round_nearest_even );
3303     switch (roundingMode) {
3304     case float_round_nearest_even:
3305     case float_round_ties_away:
3306         roundIncrement = 0x40;
3307         break;
3308     case float_round_to_zero:
3309         roundIncrement = 0;
3310         break;
3311     case float_round_up:
3312         roundIncrement = zSign ? 0 : 0x7f;
3313         break;
3314     case float_round_down:
3315         roundIncrement = zSign ? 0x7f : 0;
3316         break;
3317     default:
3318         abort();
3319     }
3320     roundBits = absZ & 0x7F;
3321     absZ = ( absZ + roundIncrement )>>7;
3322     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
3323     z = absZ;
3324     if ( zSign ) z = - z;
3325     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
3326         float_raise(float_flag_invalid, status);
3327         return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
3328     }
3329     if (roundBits) {
3330         status->float_exception_flags |= float_flag_inexact;
3331     }
3332     return z;
3333 
3334 }
3335 
3336 /*----------------------------------------------------------------------------
3337 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3338 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3339 | and returns the properly rounded 64-bit integer corresponding to the input.
3340 | If `zSign' is 1, the input is negated before being converted to an integer.
3341 | Ordinarily, the fixed-point input is simply rounded to an integer, with
3342 | the inexact exception raised if the input cannot be represented exactly as
3343 | an integer.  However, if the fixed-point input is too large, the invalid
3344 | exception is raised and the largest positive or negative integer is
3345 | returned.
3346 *----------------------------------------------------------------------------*/
3347 
3348 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
3349                                float_status *status)
3350 {
3351     int8_t roundingMode;
3352     flag roundNearestEven, increment;
3353     int64_t z;
3354 
3355     roundingMode = status->float_rounding_mode;
3356     roundNearestEven = ( roundingMode == float_round_nearest_even );
3357     switch (roundingMode) {
3358     case float_round_nearest_even:
3359     case float_round_ties_away:
3360         increment = ((int64_t) absZ1 < 0);
3361         break;
3362     case float_round_to_zero:
3363         increment = 0;
3364         break;
3365     case float_round_up:
3366         increment = !zSign && absZ1;
3367         break;
3368     case float_round_down:
3369         increment = zSign && absZ1;
3370         break;
3371     default:
3372         abort();
3373     }
3374     if ( increment ) {
3375         ++absZ0;
3376         if ( absZ0 == 0 ) goto overflow;
3377         absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
3378     }
3379     z = absZ0;
3380     if ( zSign ) z = - z;
3381     if ( z && ( ( z < 0 ) ^ zSign ) ) {
3382  overflow:
3383         float_raise(float_flag_invalid, status);
3384         return
3385               zSign ? (int64_t) LIT64( 0x8000000000000000 )
3386             : LIT64( 0x7FFFFFFFFFFFFFFF );
3387     }
3388     if (absZ1) {
3389         status->float_exception_flags |= float_flag_inexact;
3390     }
3391     return z;
3392 
3393 }
3394 
3395 /*----------------------------------------------------------------------------
3396 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3397 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3398 | and returns the properly rounded 64-bit unsigned integer corresponding to the
3399 | input.  Ordinarily, the fixed-point input is simply rounded to an integer,
3400 | with the inexact exception raised if the input cannot be represented exactly
3401 | as an integer.  However, if the fixed-point input is too large, the invalid
3402 | exception is raised and the largest unsigned integer is returned.
3403 *----------------------------------------------------------------------------*/
3404 
3405 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
3406                                 uint64_t absZ1, float_status *status)
3407 {
3408     int8_t roundingMode;
3409     flag roundNearestEven, increment;
3410 
3411     roundingMode = status->float_rounding_mode;
3412     roundNearestEven = (roundingMode == float_round_nearest_even);
3413     switch (roundingMode) {
3414     case float_round_nearest_even:
3415     case float_round_ties_away:
3416         increment = ((int64_t)absZ1 < 0);
3417         break;
3418     case float_round_to_zero:
3419         increment = 0;
3420         break;
3421     case float_round_up:
3422         increment = !zSign && absZ1;
3423         break;
3424     case float_round_down:
3425         increment = zSign && absZ1;
3426         break;
3427     default:
3428         abort();
3429     }
3430     if (increment) {
3431         ++absZ0;
3432         if (absZ0 == 0) {
3433             float_raise(float_flag_invalid, status);
3434             return LIT64(0xFFFFFFFFFFFFFFFF);
3435         }
3436         absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
3437     }
3438 
3439     if (zSign && absZ0) {
3440         float_raise(float_flag_invalid, status);
3441         return 0;
3442     }
3443 
3444     if (absZ1) {
3445         status->float_exception_flags |= float_flag_inexact;
3446     }
3447     return absZ0;
3448 }
3449 
3450 /*----------------------------------------------------------------------------
3451 | If `a' is denormal and we are in flush-to-zero mode then set the
3452 | input-denormal exception and return zero. Otherwise just return the value.
3453 *----------------------------------------------------------------------------*/
3454 float32 float32_squash_input_denormal(float32 a, float_status *status)
3455 {
3456     if (status->flush_inputs_to_zero) {
3457         if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
3458             float_raise(float_flag_input_denormal, status);
3459             return make_float32(float32_val(a) & 0x80000000);
3460         }
3461     }
3462     return a;
3463 }
3464 
3465 /*----------------------------------------------------------------------------
3466 | Normalizes the subnormal single-precision floating-point value represented
3467 | by the denormalized significand `aSig'.  The normalized exponent and
3468 | significand are stored at the locations pointed to by `zExpPtr' and
3469 | `zSigPtr', respectively.
3470 *----------------------------------------------------------------------------*/
3471 
static void
 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
{
    /* Left shift that leaves 8 leading zero bits in the significand. */
    const int8_t shift = clz32(aSig) - 8;

    *zSigPtr = aSig << shift;
    /* The exponent is lowered by the same amount, starting from 1. */
    *zExpPtr = 1 - shift;
}
3482 
3483 /*----------------------------------------------------------------------------
3484 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3485 | and significand `zSig', and returns the proper single-precision floating-
3486 | point value corresponding to the abstract input.  Ordinarily, the abstract
3487 | value is simply rounded and packed into the single-precision format, with
3488 | the inexact exception raised if the abstract input cannot be represented
3489 | exactly.  However, if the abstract value is too large, the overflow and
3490 | inexact exceptions are raised and an infinity or maximal finite value is
3491 | returned.  If the abstract value is too small, the input value is rounded to
3492 | a subnormal number, and the underflow and inexact exceptions are raised if
3493 | the abstract input cannot be represented exactly as a subnormal single-
3494 | precision floating-point number.
3495 |     The input significand `zSig' has its binary point between bits 30
3496 | and 29, which is 7 bits to the left of the usual location.  This shifted
3497 | significand must be normalized or smaller.  If `zSig' is not normalized,
3498 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3499 | and it must not require rounding.  In the usual case that `zSig' is
3500 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3501 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3502 | Binary Floating-Point Arithmetic.
3503 *----------------------------------------------------------------------------*/
3504 
3505 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
3506                                    float_status *status)
3507 {
3508     int8_t roundingMode;
3509     flag roundNearestEven;
3510     int8_t roundIncrement, roundBits;
3511     flag isTiny;
3512 
3513     roundingMode = status->float_rounding_mode;
3514     roundNearestEven = ( roundingMode == float_round_nearest_even );
3515     switch (roundingMode) {
3516     case float_round_nearest_even:
3517     case float_round_ties_away:
3518         roundIncrement = 0x40;
3519         break;
3520     case float_round_to_zero:
3521         roundIncrement = 0;
3522         break;
3523     case float_round_up:
3524         roundIncrement = zSign ? 0 : 0x7f;
3525         break;
3526     case float_round_down:
3527         roundIncrement = zSign ? 0x7f : 0;
3528         break;
3529     default:
3530         abort();
3531         break;
3532     }
3533     roundBits = zSig & 0x7F;
3534     if ( 0xFD <= (uint16_t) zExp ) {
3535         if (    ( 0xFD < zExp )
3536              || (    ( zExp == 0xFD )
3537                   && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
3538            ) {
3539             float_raise(float_flag_overflow | float_flag_inexact, status);
3540             return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
3541         }
3542         if ( zExp < 0 ) {
3543             if (status->flush_to_zero) {
3544                 float_raise(float_flag_output_denormal, status);
3545                 return packFloat32(zSign, 0, 0);
3546             }
3547             isTiny =
3548                 (status->float_detect_tininess
3549                  == float_tininess_before_rounding)
3550                 || ( zExp < -1 )
3551                 || ( zSig + roundIncrement < 0x80000000 );
3552             shift32RightJamming( zSig, - zExp, &zSig );
3553             zExp = 0;
3554             roundBits = zSig & 0x7F;
3555             if (isTiny && roundBits) {
3556                 float_raise(float_flag_underflow, status);
3557             }
3558         }
3559     }
3560     if (roundBits) {
3561         status->float_exception_flags |= float_flag_inexact;
3562     }
3563     zSig = ( zSig + roundIncrement )>>7;
3564     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
3565     if ( zSig == 0 ) zExp = 0;
3566     return packFloat32( zSign, zExp, zSig );
3567 
3568 }
3569 
3570 /*----------------------------------------------------------------------------
3571 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3572 | and significand `zSig', and returns the proper single-precision floating-
3573 | point value corresponding to the abstract input.  This routine is just like
3574 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
3575 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
3576 | floating-point exponent.
3577 *----------------------------------------------------------------------------*/
3578 
3579 static float32
3580  normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
3581                               float_status *status)
3582 {
3583     int8_t shiftCount;
3584 
3585     shiftCount = clz32(zSig) - 1;
3586     return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
3587                                status);
3588 
3589 }
3590 
3591 /*----------------------------------------------------------------------------
3592 | If `a' is denormal and we are in flush-to-zero mode then set the
3593 | input-denormal exception and return zero. Otherwise just return the value.
3594 *----------------------------------------------------------------------------*/
3595 float64 float64_squash_input_denormal(float64 a, float_status *status)
3596 {
3597     if (status->flush_inputs_to_zero) {
3598         if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
3599             float_raise(float_flag_input_denormal, status);
3600             return make_float64(float64_val(a) & (1ULL << 63));
3601         }
3602     }
3603     return a;
3604 }
3605 
/*----------------------------------------------------------------------------
| Normalizes the subnormal double-precision floating-point value whose
| denormalized significand is `aSig'.  The significand is shifted left until
| its leading one reaches bit 52; the shifted significand is written through
| `zSigPtr' and the matching exponent through `zExpPtr'.
*----------------------------------------------------------------------------*/

static void
 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
{
    /* Shift so that 11 leading zero bits remain above the significand. */
    const int lshift = clz64(aSig) - 11;

    *zSigPtr = aSig << lshift;
    *zExpPtr = 1 - lshift;
}
3623 
3624 /*----------------------------------------------------------------------------
3625 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3626 | double-precision floating-point value, returning the result.  After being
3627 | shifted into the proper positions, the three fields are simply added
3628 | together to form the result.  This means that any integer portion of `zSig'
3629 | will be added into the exponent.  Since a properly normalized significand
3630 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3631 | than the desired result exponent whenever `zSig' is a complete, normalized
3632 | significand.
3633 *----------------------------------------------------------------------------*/
3634 
3635 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig)
3636 {
3637 
3638     return make_float64(
3639         ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
3640 
3641 }
3642 
3643 /*----------------------------------------------------------------------------
3644 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3645 | and significand `zSig', and returns the proper double-precision floating-
3646 | point value corresponding to the abstract input.  Ordinarily, the abstract
3647 | value is simply rounded and packed into the double-precision format, with
3648 | the inexact exception raised if the abstract input cannot be represented
3649 | exactly.  However, if the abstract value is too large, the overflow and
3650 | inexact exceptions are raised and an infinity or maximal finite value is
3651 | returned.  If the abstract value is too small, the input value is rounded to
3652 | a subnormal number, and the underflow and inexact exceptions are raised if
3653 | the abstract input cannot be represented exactly as a subnormal double-
3654 | precision floating-point number.
3655 |     The input significand `zSig' has its binary point between bits 62
3656 | and 61, which is 10 bits to the left of the usual location.  This shifted
3657 | significand must be normalized or smaller.  If `zSig' is not normalized,
3658 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3659 | and it must not require rounding.  In the usual case that `zSig' is
3660 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3661 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3662 | Binary Floating-Point Arithmetic.
3663 *----------------------------------------------------------------------------*/
3664 
3665 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
3666                                    float_status *status)
3667 {
3668     int8_t roundingMode;
3669     flag roundNearestEven;
3670     int roundIncrement, roundBits;
3671     flag isTiny;
3672 
3673     roundingMode = status->float_rounding_mode;
3674     roundNearestEven = ( roundingMode == float_round_nearest_even );
3675     switch (roundingMode) {
3676     case float_round_nearest_even:
3677     case float_round_ties_away:
3678         roundIncrement = 0x200;
3679         break;
3680     case float_round_to_zero:
3681         roundIncrement = 0;
3682         break;
3683     case float_round_up:
3684         roundIncrement = zSign ? 0 : 0x3ff;
3685         break;
3686     case float_round_down:
3687         roundIncrement = zSign ? 0x3ff : 0;
3688         break;
3689     case float_round_to_odd:
3690         roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
3691         break;
3692     default:
3693         abort();
3694     }
3695     roundBits = zSig & 0x3FF;
3696     if ( 0x7FD <= (uint16_t) zExp ) {
3697         if (    ( 0x7FD < zExp )
3698              || (    ( zExp == 0x7FD )
3699                   && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
3700            ) {
3701             bool overflow_to_inf = roundingMode != float_round_to_odd &&
3702                                    roundIncrement != 0;
3703             float_raise(float_flag_overflow | float_flag_inexact, status);
3704             return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
3705         }
3706         if ( zExp < 0 ) {
3707             if (status->flush_to_zero) {
3708                 float_raise(float_flag_output_denormal, status);
3709                 return packFloat64(zSign, 0, 0);
3710             }
3711             isTiny =
3712                    (status->float_detect_tininess
3713                     == float_tininess_before_rounding)
3714                 || ( zExp < -1 )
3715                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
3716             shift64RightJamming( zSig, - zExp, &zSig );
3717             zExp = 0;
3718             roundBits = zSig & 0x3FF;
3719             if (isTiny && roundBits) {
3720                 float_raise(float_flag_underflow, status);
3721             }
3722             if (roundingMode == float_round_to_odd) {
3723                 /*
3724                  * For round-to-odd case, the roundIncrement depends on
3725                  * zSig which just changed.
3726                  */
3727                 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
3728             }
3729         }
3730     }
3731     if (roundBits) {
3732         status->float_exception_flags |= float_flag_inexact;
3733     }
3734     zSig = ( zSig + roundIncrement )>>10;
3735     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
3736     if ( zSig == 0 ) zExp = 0;
3737     return packFloat64( zSign, zExp, zSig );
3738 
3739 }
3740 
3741 /*----------------------------------------------------------------------------
3742 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3743 | and significand `zSig', and returns the proper double-precision floating-
3744 | point value corresponding to the abstract input.  This routine is just like
3745 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
3746 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
3747 | floating-point exponent.
3748 *----------------------------------------------------------------------------*/
3749 
3750 static float64
3751  normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
3752                               float_status *status)
3753 {
3754     int8_t shiftCount;
3755 
3756     shiftCount = clz64(zSig) - 1;
3757     return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
3758                                status);
3759 
3760 }
3761 
/*----------------------------------------------------------------------------
| Normalizes the subnormal extended double-precision floating-point value
| whose denormalized significand is `aSig'.  The significand is shifted left
| until its leading one reaches bit 63; the shifted significand is written
| through `zSigPtr' and the matching exponent through `zExpPtr'.
*----------------------------------------------------------------------------*/

void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
                                uint64_t *zSigPtr)
{
    /* The x80 format has an explicit integer bit, so no zeros are kept. */
    const int lshift = clz64(aSig);

    *zSigPtr = aSig << lshift;
    *zExpPtr = 1 - lshift;
}
3778 
3779 /*----------------------------------------------------------------------------
3780 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3781 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
3782 | and returns the proper extended double-precision floating-point value
3783 | corresponding to the abstract input.  Ordinarily, the abstract value is
3784 | rounded and packed into the extended double-precision format, with the
3785 | inexact exception raised if the abstract input cannot be represented
3786 | exactly.  However, if the abstract value is too large, the overflow and
3787 | inexact exceptions are raised and an infinity or maximal finite value is
3788 | returned.  If the abstract value is too small, the input value is rounded to
3789 | a subnormal number, and the underflow and inexact exceptions are raised if
3790 | the abstract input cannot be represented exactly as a subnormal extended
3791 | double-precision floating-point number.
3792 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
3793 | number of bits as single or double precision, respectively.  Otherwise, the
3794 | result is rounded to the full precision of the extended double-precision
3795 | format.
3796 |     The input significand must be normalized or smaller.  If the input
3797 | significand is not normalized, `zExp' must be 0; in that case, the result
3798 | returned is a subnormal number, and it must not require rounding.  The
3799 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
3800 | Floating-Point Arithmetic.
3801 *----------------------------------------------------------------------------*/
3802 
3803 floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
3804                               int32_t zExp, uint64_t zSig0, uint64_t zSig1,
3805                               float_status *status)
3806 {
3807     int8_t roundingMode;
3808     flag roundNearestEven, increment, isTiny;
3809     int64_t roundIncrement, roundMask, roundBits;
3810 
3811     roundingMode = status->float_rounding_mode;
3812     roundNearestEven = ( roundingMode == float_round_nearest_even );
3813     if ( roundingPrecision == 80 ) goto precision80;
3814     if ( roundingPrecision == 64 ) {
3815         roundIncrement = LIT64( 0x0000000000000400 );
3816         roundMask = LIT64( 0x00000000000007FF );
3817     }
3818     else if ( roundingPrecision == 32 ) {
3819         roundIncrement = LIT64( 0x0000008000000000 );
3820         roundMask = LIT64( 0x000000FFFFFFFFFF );
3821     }
3822     else {
3823         goto precision80;
3824     }
3825     zSig0 |= ( zSig1 != 0 );
3826     switch (roundingMode) {
3827     case float_round_nearest_even:
3828     case float_round_ties_away:
3829         break;
3830     case float_round_to_zero:
3831         roundIncrement = 0;
3832         break;
3833     case float_round_up:
3834         roundIncrement = zSign ? 0 : roundMask;
3835         break;
3836     case float_round_down:
3837         roundIncrement = zSign ? roundMask : 0;
3838         break;
3839     default:
3840         abort();
3841     }
3842     roundBits = zSig0 & roundMask;
3843     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
3844         if (    ( 0x7FFE < zExp )
3845              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
3846            ) {
3847             goto overflow;
3848         }
3849         if ( zExp <= 0 ) {
3850             if (status->flush_to_zero) {
3851                 float_raise(float_flag_output_denormal, status);
3852                 return packFloatx80(zSign, 0, 0);
3853             }
3854             isTiny =
3855                    (status->float_detect_tininess
3856                     == float_tininess_before_rounding)
3857                 || ( zExp < 0 )
3858                 || ( zSig0 <= zSig0 + roundIncrement );
3859             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
3860             zExp = 0;
3861             roundBits = zSig0 & roundMask;
3862             if (isTiny && roundBits) {
3863                 float_raise(float_flag_underflow, status);
3864             }
3865             if (roundBits) {
3866                 status->float_exception_flags |= float_flag_inexact;
3867             }
3868             zSig0 += roundIncrement;
3869             if ( (int64_t) zSig0 < 0 ) zExp = 1;
3870             roundIncrement = roundMask + 1;
3871             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
3872                 roundMask |= roundIncrement;
3873             }
3874             zSig0 &= ~ roundMask;
3875             return packFloatx80( zSign, zExp, zSig0 );
3876         }
3877     }
3878     if (roundBits) {
3879         status->float_exception_flags |= float_flag_inexact;
3880     }
3881     zSig0 += roundIncrement;
3882     if ( zSig0 < roundIncrement ) {
3883         ++zExp;
3884         zSig0 = LIT64( 0x8000000000000000 );
3885     }
3886     roundIncrement = roundMask + 1;
3887     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
3888         roundMask |= roundIncrement;
3889     }
3890     zSig0 &= ~ roundMask;
3891     if ( zSig0 == 0 ) zExp = 0;
3892     return packFloatx80( zSign, zExp, zSig0 );
3893  precision80:
3894     switch (roundingMode) {
3895     case float_round_nearest_even:
3896     case float_round_ties_away:
3897         increment = ((int64_t)zSig1 < 0);
3898         break;
3899     case float_round_to_zero:
3900         increment = 0;
3901         break;
3902     case float_round_up:
3903         increment = !zSign && zSig1;
3904         break;
3905     case float_round_down:
3906         increment = zSign && zSig1;
3907         break;
3908     default:
3909         abort();
3910     }
3911     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
3912         if (    ( 0x7FFE < zExp )
3913              || (    ( zExp == 0x7FFE )
3914                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
3915                   && increment
3916                 )
3917            ) {
3918             roundMask = 0;
3919  overflow:
3920             float_raise(float_flag_overflow | float_flag_inexact, status);
3921             if (    ( roundingMode == float_round_to_zero )
3922                  || ( zSign && ( roundingMode == float_round_up ) )
3923                  || ( ! zSign && ( roundingMode == float_round_down ) )
3924                ) {
3925                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
3926             }
3927             return packFloatx80(zSign,
3928                                 floatx80_infinity_high,
3929                                 floatx80_infinity_low);
3930         }
3931         if ( zExp <= 0 ) {
3932             isTiny =
3933                    (status->float_detect_tininess
3934                     == float_tininess_before_rounding)
3935                 || ( zExp < 0 )
3936                 || ! increment
3937                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
3938             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
3939             zExp = 0;
3940             if (isTiny && zSig1) {
3941                 float_raise(float_flag_underflow, status);
3942             }
3943             if (zSig1) {
3944                 status->float_exception_flags |= float_flag_inexact;
3945             }
3946             switch (roundingMode) {
3947             case float_round_nearest_even:
3948             case float_round_ties_away:
3949                 increment = ((int64_t)zSig1 < 0);
3950                 break;
3951             case float_round_to_zero:
3952                 increment = 0;
3953                 break;
3954             case float_round_up:
3955                 increment = !zSign && zSig1;
3956                 break;
3957             case float_round_down:
3958                 increment = zSign && zSig1;
3959                 break;
3960             default:
3961                 abort();
3962             }
3963             if ( increment ) {
3964                 ++zSig0;
3965                 zSig0 &=
3966                     ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
3967                 if ( (int64_t) zSig0 < 0 ) zExp = 1;
3968             }
3969             return packFloatx80( zSign, zExp, zSig0 );
3970         }
3971     }
3972     if (zSig1) {
3973         status->float_exception_flags |= float_flag_inexact;
3974     }
3975     if ( increment ) {
3976         ++zSig0;
3977         if ( zSig0 == 0 ) {
3978             ++zExp;
3979             zSig0 = LIT64( 0x8000000000000000 );
3980         }
3981         else {
3982             zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
3983         }
3984     }
3985     else {
3986         if ( zSig0 == 0 ) zExp = 0;
3987     }
3988     return packFloatx80( zSign, zExp, zSig0 );
3989 
3990 }
3991 
3992 /*----------------------------------------------------------------------------
3993 | Takes an abstract floating-point value having sign `zSign', exponent
3994 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
3995 | and returns the proper extended double-precision floating-point value
3996 | corresponding to the abstract input.  This routine is just like
3997 | `roundAndPackFloatx80' except that the input significand does not have to be
3998 | normalized.
3999 *----------------------------------------------------------------------------*/
4000 
4001 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
4002                                        flag zSign, int32_t zExp,
4003                                        uint64_t zSig0, uint64_t zSig1,
4004                                        float_status *status)
4005 {
4006     int8_t shiftCount;
4007 
4008     if ( zSig0 == 0 ) {
4009         zSig0 = zSig1;
4010         zSig1 = 0;
4011         zExp -= 64;
4012     }
4013     shiftCount = clz64(zSig0);
4014     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4015     zExp -= shiftCount;
4016     return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
4017                                 zSig0, zSig1, status);
4018 
4019 }
4020 
4021 /*----------------------------------------------------------------------------
4022 | Returns the least-significant 64 fraction bits of the quadruple-precision
4023 | floating-point value `a'.
4024 *----------------------------------------------------------------------------*/
4025 
4026 static inline uint64_t extractFloat128Frac1( float128 a )
4027 {
4028 
4029     return a.low;
4030 
4031 }
4032 
4033 /*----------------------------------------------------------------------------
4034 | Returns the most-significant 48 fraction bits of the quadruple-precision
4035 | floating-point value `a'.
4036 *----------------------------------------------------------------------------*/
4037 
4038 static inline uint64_t extractFloat128Frac0( float128 a )
4039 {
4040 
4041     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
4042 
4043 }
4044 
4045 /*----------------------------------------------------------------------------
4046 | Returns the exponent bits of the quadruple-precision floating-point value
4047 | `a'.
4048 *----------------------------------------------------------------------------*/
4049 
4050 static inline int32_t extractFloat128Exp( float128 a )
4051 {
4052 
4053     return ( a.high>>48 ) & 0x7FFF;
4054 
4055 }
4056 
4057 /*----------------------------------------------------------------------------
4058 | Returns the sign bit of the quadruple-precision floating-point value `a'.
4059 *----------------------------------------------------------------------------*/
4060 
4061 static inline flag extractFloat128Sign( float128 a )
4062 {
4063 
4064     return a.high>>63;
4065 
4066 }
4067 
/*----------------------------------------------------------------------------
| Normalizes the subnormal quadruple-precision floating-point value whose
| denormalized significand is the concatenation of `aSig0' and `aSig1'.
| The normalized exponent is stored at the location pointed to by `zExpPtr'.
| The most significant 49 bits of the normalized significand are stored at
| the location pointed to by `zSig0Ptr', and the least significant 64 bits at
| the location pointed to by `zSig1Ptr'.
*----------------------------------------------------------------------------*/

static void
 normalizeFloat128Subnormal(
     uint64_t aSig0,
     uint64_t aSig1,
     int32_t *zExpPtr,
     uint64_t *zSig0Ptr,
     uint64_t *zSig1Ptr
 )
{
    int lshift;

    if (aSig0 != 0) {
        /* Leading one is in the high word: plain 128-bit left shift. */
        lshift = clz64(aSig0) - 15;
        shortShift128Left(aSig0, aSig1, lshift, zSig0Ptr, zSig1Ptr);
        *zExpPtr = 1 - lshift;
        return;
    }

    /*
     * High word empty: the low word is moved up by 64 bits and then adjusted
     * by `lshift', which may point either way.
     */
    lshift = clz64(aSig1) - 15;
    if (lshift >= 0) {
        *zSig0Ptr = aSig1 << lshift;
        *zSig1Ptr = 0;
    } else {
        /* Leading one would land above bit 48: split across both words. */
        *zSig0Ptr = aSig1 >> (-lshift);
        *zSig1Ptr = aSig1 << (lshift & 63);
    }
    *zExpPtr = -lshift - 63;
}
4108 
4109 /*----------------------------------------------------------------------------
4110 | Packs the sign `zSign', the exponent `zExp', and the significand formed
4111 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
4112 | floating-point value, returning the result.  After being shifted into the
4113 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
4114 | added together to form the most significant 32 bits of the result.  This
4115 | means that any integer portion of `zSig0' will be added into the exponent.
4116 | Since a properly normalized significand will have an integer portion equal
4117 | to 1, the `zExp' input should be 1 less than the desired result exponent
4118 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
4119 | significand.
4120 *----------------------------------------------------------------------------*/
4121 
4122 static inline float128
4123  packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
4124 {
4125     float128 z;
4126 
4127     z.low = zSig1;
4128     z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
4129     return z;
4130 
4131 }
4132 
4133 /*----------------------------------------------------------------------------
4134 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4135 | and extended significand formed by the concatenation of `zSig0', `zSig1',
4136 | and `zSig2', and returns the proper quadruple-precision floating-point value
4137 | corresponding to the abstract input.  Ordinarily, the abstract value is
4138 | simply rounded and packed into the quadruple-precision format, with the
4139 | inexact exception raised if the abstract input cannot be represented
4140 | exactly.  However, if the abstract value is too large, the overflow and
4141 | inexact exceptions are raised and an infinity or maximal finite value is
4142 | returned.  If the abstract value is too small, the input value is rounded to
4143 | a subnormal number, and the underflow and inexact exceptions are raised if
4144 | the abstract input cannot be represented exactly as a subnormal quadruple-
4145 | precision floating-point number.
4146 |     The input significand must be normalized or smaller.  If the input
4147 | significand is not normalized, `zExp' must be 0; in that case, the result
4148 | returned is a subnormal number, and it must not require rounding.  In the
4149 | usual case that the input significand is normalized, `zExp' must be 1 less
4150 | than the ``true'' floating-point exponent.  The handling of underflow and
4151 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4152 *----------------------------------------------------------------------------*/
4153 
4154 static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
4155                                      uint64_t zSig0, uint64_t zSig1,
4156                                      uint64_t zSig2, float_status *status)
4157 {
4158     int8_t roundingMode;
4159     flag roundNearestEven, increment, isTiny;
4160 
4161     roundingMode = status->float_rounding_mode;
4162     roundNearestEven = ( roundingMode == float_round_nearest_even );
4163     switch (roundingMode) {
4164     case float_round_nearest_even:
4165     case float_round_ties_away:
4166         increment = ((int64_t)zSig2 < 0);
4167         break;
4168     case float_round_to_zero:
4169         increment = 0;
4170         break;
4171     case float_round_up:
4172         increment = !zSign && zSig2;
4173         break;
4174     case float_round_down:
4175         increment = zSign && zSig2;
4176         break;
4177     case float_round_to_odd:
4178         increment = !(zSig1 & 0x1) && zSig2;
4179         break;
4180     default:
4181         abort();
4182     }
4183     if ( 0x7FFD <= (uint32_t) zExp ) {
4184         if (    ( 0x7FFD < zExp )
4185              || (    ( zExp == 0x7FFD )
4186                   && eq128(
4187                          LIT64( 0x0001FFFFFFFFFFFF ),
4188                          LIT64( 0xFFFFFFFFFFFFFFFF ),
4189                          zSig0,
4190                          zSig1
4191                      )
4192                   && increment
4193                 )
4194            ) {
4195             float_raise(float_flag_overflow | float_flag_inexact, status);
4196             if (    ( roundingMode == float_round_to_zero )
4197                  || ( zSign && ( roundingMode == float_round_up ) )
4198                  || ( ! zSign && ( roundingMode == float_round_down ) )
4199                  || (roundingMode == float_round_to_odd)
4200                ) {
4201                 return
4202                     packFloat128(
4203                         zSign,
4204                         0x7FFE,
4205                         LIT64( 0x0000FFFFFFFFFFFF ),
4206                         LIT64( 0xFFFFFFFFFFFFFFFF )
4207                     );
4208             }
4209             return packFloat128( zSign, 0x7FFF, 0, 0 );
4210         }
4211         if ( zExp < 0 ) {
4212             if (status->flush_to_zero) {
4213                 float_raise(float_flag_output_denormal, status);
4214                 return packFloat128(zSign, 0, 0, 0);
4215             }
4216             isTiny =
4217                    (status->float_detect_tininess
4218                     == float_tininess_before_rounding)
4219                 || ( zExp < -1 )
4220                 || ! increment
4221                 || lt128(
4222                        zSig0,
4223                        zSig1,
4224                        LIT64( 0x0001FFFFFFFFFFFF ),
4225                        LIT64( 0xFFFFFFFFFFFFFFFF )
4226                    );
4227             shift128ExtraRightJamming(
4228                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
4229             zExp = 0;
4230             if (isTiny && zSig2) {
4231                 float_raise(float_flag_underflow, status);
4232             }
4233             switch (roundingMode) {
4234             case float_round_nearest_even:
4235             case float_round_ties_away:
4236                 increment = ((int64_t)zSig2 < 0);
4237                 break;
4238             case float_round_to_zero:
4239                 increment = 0;
4240                 break;
4241             case float_round_up:
4242                 increment = !zSign && zSig2;
4243                 break;
4244             case float_round_down:
4245                 increment = zSign && zSig2;
4246                 break;
4247             case float_round_to_odd:
4248                 increment = !(zSig1 & 0x1) && zSig2;
4249                 break;
4250             default:
4251                 abort();
4252             }
4253         }
4254     }
4255     if (zSig2) {
4256         status->float_exception_flags |= float_flag_inexact;
4257     }
4258     if ( increment ) {
4259         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
4260         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
4261     }
4262     else {
4263         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
4264     }
4265     return packFloat128( zSign, zExp, zSig0, zSig1 );
4266 
4267 }
4268 
4269 /*----------------------------------------------------------------------------
4270 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4271 | and significand formed by the concatenation of `zSig0' and `zSig1', and
4272 | returns the proper quadruple-precision floating-point value corresponding
4273 | to the abstract input.  This routine is just like `roundAndPackFloat128'
4274 | except that the input significand has fewer bits and does not have to be
4275 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
4276 | point exponent.
4277 *----------------------------------------------------------------------------*/
4278 
4279 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
4280                                               uint64_t zSig0, uint64_t zSig1,
4281                                               float_status *status)
4282 {
4283     int8_t shiftCount;
4284     uint64_t zSig2;
4285 
4286     if ( zSig0 == 0 ) {
4287         zSig0 = zSig1;
4288         zSig1 = 0;
4289         zExp -= 64;
4290     }
4291     shiftCount = clz64(zSig0) - 15;
4292     if ( 0 <= shiftCount ) {
4293         zSig2 = 0;
4294         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4295     }
4296     else {
4297         shift128ExtraRightJamming(
4298             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
4299     }
4300     zExp -= shiftCount;
4301     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
4302 
4303 }
4304 
4305 
4306 /*----------------------------------------------------------------------------
4307 | Returns the result of converting the 32-bit two's complement integer `a'
4308 | to the extended double-precision floating-point format.  The conversion
4309 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4310 | Arithmetic.
4311 *----------------------------------------------------------------------------*/
4312 
4313 floatx80 int32_to_floatx80(int32_t a, float_status *status)
4314 {
4315     flag zSign;
4316     uint32_t absA;
4317     int8_t shiftCount;
4318     uint64_t zSig;
4319 
4320     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
4321     zSign = ( a < 0 );
4322     absA = zSign ? - a : a;
4323     shiftCount = clz32(absA) + 32;
4324     zSig = absA;
4325     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
4326 
4327 }
4328 
4329 /*----------------------------------------------------------------------------
4330 | Returns the result of converting the 32-bit two's complement integer `a' to
4331 | the quadruple-precision floating-point format.  The conversion is performed
4332 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4333 *----------------------------------------------------------------------------*/
4334 
4335 float128 int32_to_float128(int32_t a, float_status *status)
4336 {
4337     flag zSign;
4338     uint32_t absA;
4339     int8_t shiftCount;
4340     uint64_t zSig0;
4341 
4342     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
4343     zSign = ( a < 0 );
4344     absA = zSign ? - a : a;
4345     shiftCount = clz32(absA) + 17;
4346     zSig0 = absA;
4347     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
4348 
4349 }
4350 
4351 /*----------------------------------------------------------------------------
4352 | Returns the result of converting the 64-bit two's complement integer `a'
4353 | to the extended double-precision floating-point format.  The conversion
4354 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4355 | Arithmetic.
4356 *----------------------------------------------------------------------------*/
4357 
4358 floatx80 int64_to_floatx80(int64_t a, float_status *status)
4359 {
4360     flag zSign;
4361     uint64_t absA;
4362     int8_t shiftCount;
4363 
4364     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
4365     zSign = ( a < 0 );
4366     absA = zSign ? - a : a;
4367     shiftCount = clz64(absA);
4368     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
4369 
4370 }
4371 
4372 /*----------------------------------------------------------------------------
4373 | Returns the result of converting the 64-bit two's complement integer `a' to
4374 | the quadruple-precision floating-point format.  The conversion is performed
4375 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4376 *----------------------------------------------------------------------------*/
4377 
4378 float128 int64_to_float128(int64_t a, float_status *status)
4379 {
4380     flag zSign;
4381     uint64_t absA;
4382     int8_t shiftCount;
4383     int32_t zExp;
4384     uint64_t zSig0, zSig1;
4385 
4386     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
4387     zSign = ( a < 0 );
4388     absA = zSign ? - a : a;
4389     shiftCount = clz64(absA) + 49;
4390     zExp = 0x406E - shiftCount;
4391     if ( 64 <= shiftCount ) {
4392         zSig1 = 0;
4393         zSig0 = absA;
4394         shiftCount -= 64;
4395     }
4396     else {
4397         zSig1 = absA;
4398         zSig0 = 0;
4399     }
4400     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4401     return packFloat128( zSign, zExp, zSig0, zSig1 );
4402 
4403 }
4404 
4405 /*----------------------------------------------------------------------------
4406 | Returns the result of converting the 64-bit unsigned integer `a'
4407 | to the quadruple-precision floating-point format.  The conversion is performed
4408 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4409 *----------------------------------------------------------------------------*/
4410 
4411 float128 uint64_to_float128(uint64_t a, float_status *status)
4412 {
4413     if (a == 0) {
4414         return float128_zero;
4415     }
4416     return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status);
4417 }
4418 
4419 /*----------------------------------------------------------------------------
4420 | Returns the result of converting the single-precision floating-point value
4421 | `a' to the extended double-precision floating-point format.  The conversion
4422 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4423 | Arithmetic.
4424 *----------------------------------------------------------------------------*/
4425 
4426 floatx80 float32_to_floatx80(float32 a, float_status *status)
4427 {
4428     flag aSign;
4429     int aExp;
4430     uint32_t aSig;
4431 
4432     a = float32_squash_input_denormal(a, status);
4433     aSig = extractFloat32Frac( a );
4434     aExp = extractFloat32Exp( a );
4435     aSign = extractFloat32Sign( a );
4436     if ( aExp == 0xFF ) {
4437         if (aSig) {
4438             return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
4439         }
4440         return packFloatx80(aSign,
4441                             floatx80_infinity_high,
4442                             floatx80_infinity_low);
4443     }
4444     if ( aExp == 0 ) {
4445         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
4446         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4447     }
4448     aSig |= 0x00800000;
4449     return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
4450 
4451 }
4452 
4453 /*----------------------------------------------------------------------------
4454 | Returns the result of converting the single-precision floating-point value
4455 | `a' to the double-precision floating-point format.  The conversion is
4456 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4457 | Arithmetic.
4458 *----------------------------------------------------------------------------*/
4459 
4460 float128 float32_to_float128(float32 a, float_status *status)
4461 {
4462     flag aSign;
4463     int aExp;
4464     uint32_t aSig;
4465 
4466     a = float32_squash_input_denormal(a, status);
4467     aSig = extractFloat32Frac( a );
4468     aExp = extractFloat32Exp( a );
4469     aSign = extractFloat32Sign( a );
4470     if ( aExp == 0xFF ) {
4471         if (aSig) {
4472             return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
4473         }
4474         return packFloat128( aSign, 0x7FFF, 0, 0 );
4475     }
4476     if ( aExp == 0 ) {
4477         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
4478         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4479         --aExp;
4480     }
4481     return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
4482 
4483 }
4484 
4485 /*----------------------------------------------------------------------------
4486 | Returns the remainder of the single-precision floating-point value `a'
4487 | with respect to the corresponding value `b'.  The operation is performed
4488 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4489 *----------------------------------------------------------------------------*/
4490 
4491 float32 float32_rem(float32 a, float32 b, float_status *status)
4492 {
4493     flag aSign, zSign;
4494     int aExp, bExp, expDiff;
4495     uint32_t aSig, bSig;
4496     uint32_t q;
4497     uint64_t aSig64, bSig64, q64;
4498     uint32_t alternateASig;
4499     int32_t sigMean;
4500     a = float32_squash_input_denormal(a, status);
4501     b = float32_squash_input_denormal(b, status);
4502 
4503     aSig = extractFloat32Frac( a );
4504     aExp = extractFloat32Exp( a );
4505     aSign = extractFloat32Sign( a );
4506     bSig = extractFloat32Frac( b );
4507     bExp = extractFloat32Exp( b );
4508     if ( aExp == 0xFF ) {
4509         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
4510             return propagateFloat32NaN(a, b, status);
4511         }
4512         float_raise(float_flag_invalid, status);
4513         return float32_default_nan(status);
4514     }
4515     if ( bExp == 0xFF ) {
4516         if (bSig) {
4517             return propagateFloat32NaN(a, b, status);
4518         }
4519         return a;
4520     }
4521     if ( bExp == 0 ) {
4522         if ( bSig == 0 ) {
4523             float_raise(float_flag_invalid, status);
4524             return float32_default_nan(status);
4525         }
4526         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
4527     }
4528     if ( aExp == 0 ) {
4529         if ( aSig == 0 ) return a;
4530         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4531     }
4532     expDiff = aExp - bExp;
4533     aSig |= 0x00800000;
4534     bSig |= 0x00800000;
4535     if ( expDiff < 32 ) {
4536         aSig <<= 8;
4537         bSig <<= 8;
4538         if ( expDiff < 0 ) {
4539             if ( expDiff < -1 ) return a;
4540             aSig >>= 1;
4541         }
4542         q = ( bSig <= aSig );
4543         if ( q ) aSig -= bSig;
4544         if ( 0 < expDiff ) {
4545             q = ( ( (uint64_t) aSig )<<32 ) / bSig;
4546             q >>= 32 - expDiff;
4547             bSig >>= 2;
4548             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4549         }
4550         else {
4551             aSig >>= 2;
4552             bSig >>= 2;
4553         }
4554     }
4555     else {
4556         if ( bSig <= aSig ) aSig -= bSig;
4557         aSig64 = ( (uint64_t) aSig )<<40;
4558         bSig64 = ( (uint64_t) bSig )<<40;
4559         expDiff -= 64;
4560         while ( 0 < expDiff ) {
4561             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
4562             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
4563             aSig64 = - ( ( bSig * q64 )<<38 );
4564             expDiff -= 62;
4565         }
4566         expDiff += 64;
4567         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
4568         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
4569         q = q64>>( 64 - expDiff );
4570         bSig <<= 6;
4571         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
4572     }
4573     do {
4574         alternateASig = aSig;
4575         ++q;
4576         aSig -= bSig;
4577     } while ( 0 <= (int32_t) aSig );
4578     sigMean = aSig + alternateASig;
4579     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4580         aSig = alternateASig;
4581     }
4582     zSign = ( (int32_t) aSig < 0 );
4583     if ( zSign ) aSig = - aSig;
4584     return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
4585 }
4586 
4587 
4588 
4589 /*----------------------------------------------------------------------------
4590 | Returns the binary exponential of the single-precision floating-point value
4591 | `a'. The operation is performed according to the IEC/IEEE Standard for
4592 | Binary Floating-Point Arithmetic.
4593 |
4594 | Uses the following identities:
4595 |
4596 | 1. -------------------------------------------------------------------------
4597 |      x    x*ln(2)
4598 |     2  = e
4599 |
4600 | 2. -------------------------------------------------------------------------
4601 |                      2     3     4     5           n
4602 |      x        x     x     x     x     x           x
4603 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
4604 |               1!    2!    3!    4!    5!          n!
4605 *----------------------------------------------------------------------------*/
4606 
/*
 * Series coefficients used by float32_exp2(): entry i holds 1/(i+1)!
 * encoded as a float64 bit pattern, i.e. the factor that multiplies
 * x^(i+1) in the expansion of e^x.
 */
static const float64 float32_exp2_coefficients[15] =
{
    const_float64( 0x3ff0000000000000ll ), /*  1/1!  */
    const_float64( 0x3fe0000000000000ll ), /*  1/2!  */
    const_float64( 0x3fc5555555555555ll ), /*  1/3!  */
    const_float64( 0x3fa5555555555555ll ), /*  1/4!  */
    const_float64( 0x3f81111111111111ll ), /*  1/5!  */
    const_float64( 0x3f56c16c16c16c17ll ), /*  1/6!  */
    const_float64( 0x3f2a01a01a01a01all ), /*  1/7!  */
    const_float64( 0x3efa01a01a01a01all ), /*  1/8!  */
    const_float64( 0x3ec71de3a556c734ll ), /*  1/9!  */
    const_float64( 0x3e927e4fb7789f5cll ), /*  1/10! */
    const_float64( 0x3e5ae64567f544e4ll ), /*  1/11! */
    const_float64( 0x3e21eed8eff8d898ll ), /*  1/12! */
    const_float64( 0x3de6124613a86d09ll ), /*  1/13! */
    const_float64( 0x3da93974a8c07c9dll ), /*  1/14! */
    const_float64( 0x3d6ae7f3e733b81fll ), /*  1/15! */
};
4625 
4626 float32 float32_exp2(float32 a, float_status *status)
4627 {
4628     flag aSign;
4629     int aExp;
4630     uint32_t aSig;
4631     float64 r, x, xn;
4632     int i;
4633     a = float32_squash_input_denormal(a, status);
4634 
4635     aSig = extractFloat32Frac( a );
4636     aExp = extractFloat32Exp( a );
4637     aSign = extractFloat32Sign( a );
4638 
4639     if ( aExp == 0xFF) {
4640         if (aSig) {
4641             return propagateFloat32NaN(a, float32_zero, status);
4642         }
4643         return (aSign) ? float32_zero : a;
4644     }
4645     if (aExp == 0) {
4646         if (aSig == 0) return float32_one;
4647     }
4648 
4649     float_raise(float_flag_inexact, status);
4650 
4651     /* ******************************* */
4652     /* using float64 for approximation */
4653     /* ******************************* */
4654     x = float32_to_float64(a, status);
4655     x = float64_mul(x, float64_ln2, status);
4656 
4657     xn = x;
4658     r = float64_one;
4659     for (i = 0 ; i < 15 ; i++) {
4660         float64 f;
4661 
4662         f = float64_mul(xn, float32_exp2_coefficients[i], status);
4663         r = float64_add(r, f, status);
4664 
4665         xn = float64_mul(xn, x, status);
4666     }
4667 
4668     return float64_to_float32(r, status);
4669 }
4670 
4671 /*----------------------------------------------------------------------------
4672 | Returns the binary log of the single-precision floating-point value `a'.
4673 | The operation is performed according to the IEC/IEEE Standard for Binary
4674 | Floating-Point Arithmetic.
4675 *----------------------------------------------------------------------------*/
4676 float32 float32_log2(float32 a, float_status *status)
4677 {
4678     flag aSign, zSign;
4679     int aExp;
4680     uint32_t aSig, zSig, i;
4681 
4682     a = float32_squash_input_denormal(a, status);
4683     aSig = extractFloat32Frac( a );
4684     aExp = extractFloat32Exp( a );
4685     aSign = extractFloat32Sign( a );
4686 
4687     if ( aExp == 0 ) {
4688         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
4689         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4690     }
4691     if ( aSign ) {
4692         float_raise(float_flag_invalid, status);
4693         return float32_default_nan(status);
4694     }
4695     if ( aExp == 0xFF ) {
4696         if (aSig) {
4697             return propagateFloat32NaN(a, float32_zero, status);
4698         }
4699         return a;
4700     }
4701 
4702     aExp -= 0x7F;
4703     aSig |= 0x00800000;
4704     zSign = aExp < 0;
4705     zSig = aExp << 23;
4706 
4707     for (i = 1 << 22; i > 0; i >>= 1) {
4708         aSig = ( (uint64_t)aSig * aSig ) >> 23;
4709         if ( aSig & 0x01000000 ) {
4710             aSig >>= 1;
4711             zSig |= i;
4712         }
4713     }
4714 
4715     if ( zSign )
4716         zSig = -zSig;
4717 
4718     return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
4719 }
4720 
4721 /*----------------------------------------------------------------------------
4722 | Returns 1 if the single-precision floating-point value `a' is equal to
4723 | the corresponding value `b', and 0 otherwise.  The invalid exception is
4724 | raised if either operand is a NaN.  Otherwise, the comparison is performed
4725 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4726 *----------------------------------------------------------------------------*/
4727 
4728 int float32_eq(float32 a, float32 b, float_status *status)
4729 {
4730     uint32_t av, bv;
4731     a = float32_squash_input_denormal(a, status);
4732     b = float32_squash_input_denormal(b, status);
4733 
4734     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4735          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4736        ) {
4737         float_raise(float_flag_invalid, status);
4738         return 0;
4739     }
4740     av = float32_val(a);
4741     bv = float32_val(b);
4742     return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
4743 }
4744 
4745 /*----------------------------------------------------------------------------
4746 | Returns 1 if the single-precision floating-point value `a' is less than
4747 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
4748 | exception is raised if either operand is a NaN.  The comparison is performed
4749 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4750 *----------------------------------------------------------------------------*/
4751 
4752 int float32_le(float32 a, float32 b, float_status *status)
4753 {
4754     flag aSign, bSign;
4755     uint32_t av, bv;
4756     a = float32_squash_input_denormal(a, status);
4757     b = float32_squash_input_denormal(b, status);
4758 
4759     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4760          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4761        ) {
4762         float_raise(float_flag_invalid, status);
4763         return 0;
4764     }
4765     aSign = extractFloat32Sign( a );
4766     bSign = extractFloat32Sign( b );
4767     av = float32_val(a);
4768     bv = float32_val(b);
4769     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
4770     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4771 
4772 }
4773 
4774 /*----------------------------------------------------------------------------
4775 | Returns 1 if the single-precision floating-point value `a' is less than
4776 | the corresponding value `b', and 0 otherwise.  The invalid exception is
4777 | raised if either operand is a NaN.  The comparison is performed according
4778 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4779 *----------------------------------------------------------------------------*/
4780 
4781 int float32_lt(float32 a, float32 b, float_status *status)
4782 {
4783     flag aSign, bSign;
4784     uint32_t av, bv;
4785     a = float32_squash_input_denormal(a, status);
4786     b = float32_squash_input_denormal(b, status);
4787 
4788     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4789          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4790        ) {
4791         float_raise(float_flag_invalid, status);
4792         return 0;
4793     }
4794     aSign = extractFloat32Sign( a );
4795     bSign = extractFloat32Sign( b );
4796     av = float32_val(a);
4797     bv = float32_val(b);
4798     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
4799     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4800 
4801 }
4802 
4803 /*----------------------------------------------------------------------------
4804 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
4805 | be compared, and 0 otherwise.  The invalid exception is raised if either
4806 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
4807 | Standard for Binary Floating-Point Arithmetic.
4808 *----------------------------------------------------------------------------*/
4809 
4810 int float32_unordered(float32 a, float32 b, float_status *status)
4811 {
4812     a = float32_squash_input_denormal(a, status);
4813     b = float32_squash_input_denormal(b, status);
4814 
4815     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4816          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4817        ) {
4818         float_raise(float_flag_invalid, status);
4819         return 1;
4820     }
4821     return 0;
4822 }
4823 
4824 /*----------------------------------------------------------------------------
4825 | Returns 1 if the single-precision floating-point value `a' is equal to
4826 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4827 | exception.  The comparison is performed according to the IEC/IEEE Standard
4828 | for Binary Floating-Point Arithmetic.
4829 *----------------------------------------------------------------------------*/
4830 
4831 int float32_eq_quiet(float32 a, float32 b, float_status *status)
4832 {
4833     a = float32_squash_input_denormal(a, status);
4834     b = float32_squash_input_denormal(b, status);
4835 
4836     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4837          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4838        ) {
4839         if (float32_is_signaling_nan(a, status)
4840          || float32_is_signaling_nan(b, status)) {
4841             float_raise(float_flag_invalid, status);
4842         }
4843         return 0;
4844     }
4845     return ( float32_val(a) == float32_val(b) ) ||
4846             ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
4847 }
4848 
4849 /*----------------------------------------------------------------------------
4850 | Returns 1 if the single-precision floating-point value `a' is less than or
4851 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4852 | cause an exception.  Otherwise, the comparison is performed according to the
4853 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4854 *----------------------------------------------------------------------------*/
4855 
4856 int float32_le_quiet(float32 a, float32 b, float_status *status)
4857 {
4858     flag aSign, bSign;
4859     uint32_t av, bv;
4860     a = float32_squash_input_denormal(a, status);
4861     b = float32_squash_input_denormal(b, status);
4862 
4863     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4864          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4865        ) {
4866         if (float32_is_signaling_nan(a, status)
4867          || float32_is_signaling_nan(b, status)) {
4868             float_raise(float_flag_invalid, status);
4869         }
4870         return 0;
4871     }
4872     aSign = extractFloat32Sign( a );
4873     bSign = extractFloat32Sign( b );
4874     av = float32_val(a);
4875     bv = float32_val(b);
4876     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
4877     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4878 
4879 }
4880 
4881 /*----------------------------------------------------------------------------
4882 | Returns 1 if the single-precision floating-point value `a' is less than
4883 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4884 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4885 | Standard for Binary Floating-Point Arithmetic.
4886 *----------------------------------------------------------------------------*/
4887 
4888 int float32_lt_quiet(float32 a, float32 b, float_status *status)
4889 {
4890     flag aSign, bSign;
4891     uint32_t av, bv;
4892     a = float32_squash_input_denormal(a, status);
4893     b = float32_squash_input_denormal(b, status);
4894 
4895     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4896          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4897        ) {
4898         if (float32_is_signaling_nan(a, status)
4899          || float32_is_signaling_nan(b, status)) {
4900             float_raise(float_flag_invalid, status);
4901         }
4902         return 0;
4903     }
4904     aSign = extractFloat32Sign( a );
4905     bSign = extractFloat32Sign( b );
4906     av = float32_val(a);
4907     bv = float32_val(b);
4908     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
4909     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4910 
4911 }
4912 
4913 /*----------------------------------------------------------------------------
4914 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
4915 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
4916 | comparison is performed according to the IEC/IEEE Standard for Binary
4917 | Floating-Point Arithmetic.
4918 *----------------------------------------------------------------------------*/
4919 
4920 int float32_unordered_quiet(float32 a, float32 b, float_status *status)
4921 {
4922     a = float32_squash_input_denormal(a, status);
4923     b = float32_squash_input_denormal(b, status);
4924 
4925     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4926          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4927        ) {
4928         if (float32_is_signaling_nan(a, status)
4929          || float32_is_signaling_nan(b, status)) {
4930             float_raise(float_flag_invalid, status);
4931         }
4932         return 1;
4933     }
4934     return 0;
4935 }
4936 
4937 /*----------------------------------------------------------------------------
4938 | If `a' is denormal and we are in flush-to-zero mode then set the
4939 | input-denormal exception and return zero. Otherwise just return the value.
4940 *----------------------------------------------------------------------------*/
4941 float16 float16_squash_input_denormal(float16 a, float_status *status)
4942 {
4943     if (status->flush_inputs_to_zero) {
4944         if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) {
4945             float_raise(float_flag_input_denormal, status);
4946             return make_float16(float16_val(a) & 0x8000);
4947         }
4948     }
4949     return a;
4950 }
4951 
4952 /*----------------------------------------------------------------------------
4953 | Returns the result of converting the double-precision floating-point value
4954 | `a' to the extended double-precision floating-point format.  The conversion
4955 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4956 | Arithmetic.
4957 *----------------------------------------------------------------------------*/
4958 
4959 floatx80 float64_to_floatx80(float64 a, float_status *status)
4960 {
4961     flag aSign;
4962     int aExp;
4963     uint64_t aSig;
4964 
4965     a = float64_squash_input_denormal(a, status);
4966     aSig = extractFloat64Frac( a );
4967     aExp = extractFloat64Exp( a );
4968     aSign = extractFloat64Sign( a );
4969     if ( aExp == 0x7FF ) {
4970         if (aSig) {
4971             return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
4972         }
4973         return packFloatx80(aSign,
4974                             floatx80_infinity_high,
4975                             floatx80_infinity_low);
4976     }
4977     if ( aExp == 0 ) {
4978         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
4979         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4980     }
4981     return
4982         packFloatx80(
4983             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
4984 
4985 }
4986 
4987 /*----------------------------------------------------------------------------
4988 | Returns the result of converting the double-precision floating-point value
4989 | `a' to the quadruple-precision floating-point format.  The conversion is
4990 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4991 | Arithmetic.
4992 *----------------------------------------------------------------------------*/
4993 
4994 float128 float64_to_float128(float64 a, float_status *status)
4995 {
4996     flag aSign;
4997     int aExp;
4998     uint64_t aSig, zSig0, zSig1;
4999 
5000     a = float64_squash_input_denormal(a, status);
5001     aSig = extractFloat64Frac( a );
5002     aExp = extractFloat64Exp( a );
5003     aSign = extractFloat64Sign( a );
5004     if ( aExp == 0x7FF ) {
5005         if (aSig) {
5006             return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
5007         }
5008         return packFloat128( aSign, 0x7FFF, 0, 0 );
5009     }
5010     if ( aExp == 0 ) {
5011         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
5012         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5013         --aExp;
5014     }
5015     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
5016     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
5017 
5018 }
5019 
5020 
5021 /*----------------------------------------------------------------------------
5022 | Returns the remainder of the double-precision floating-point value `a'
5023 | with respect to the corresponding value `b'.  The operation is performed
5024 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5025 *----------------------------------------------------------------------------*/
5026 
5027 float64 float64_rem(float64 a, float64 b, float_status *status)
5028 {
5029     flag aSign, zSign;
5030     int aExp, bExp, expDiff;
5031     uint64_t aSig, bSig;
5032     uint64_t q, alternateASig;
5033     int64_t sigMean;
5034 
5035     a = float64_squash_input_denormal(a, status);
5036     b = float64_squash_input_denormal(b, status);
5037     aSig = extractFloat64Frac( a );
5038     aExp = extractFloat64Exp( a );
5039     aSign = extractFloat64Sign( a );
5040     bSig = extractFloat64Frac( b );
5041     bExp = extractFloat64Exp( b );
5042     if ( aExp == 0x7FF ) {
5043         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
5044             return propagateFloat64NaN(a, b, status);
5045         }
5046         float_raise(float_flag_invalid, status);
5047         return float64_default_nan(status);
5048     }
5049     if ( bExp == 0x7FF ) {
5050         if (bSig) {
5051             return propagateFloat64NaN(a, b, status);
5052         }
5053         return a;
5054     }
5055     if ( bExp == 0 ) {
5056         if ( bSig == 0 ) {
5057             float_raise(float_flag_invalid, status);
5058             return float64_default_nan(status);
5059         }
5060         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
5061     }
5062     if ( aExp == 0 ) {
5063         if ( aSig == 0 ) return a;
5064         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5065     }
5066     expDiff = aExp - bExp;
5067     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
5068     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
5069     if ( expDiff < 0 ) {
5070         if ( expDiff < -1 ) return a;
5071         aSig >>= 1;
5072     }
5073     q = ( bSig <= aSig );
5074     if ( q ) aSig -= bSig;
5075     expDiff -= 64;
5076     while ( 0 < expDiff ) {
5077         q = estimateDiv128To64( aSig, 0, bSig );
5078         q = ( 2 < q ) ? q - 2 : 0;
5079         aSig = - ( ( bSig>>2 ) * q );
5080         expDiff -= 62;
5081     }
5082     expDiff += 64;
5083     if ( 0 < expDiff ) {
5084         q = estimateDiv128To64( aSig, 0, bSig );
5085         q = ( 2 < q ) ? q - 2 : 0;
5086         q >>= 64 - expDiff;
5087         bSig >>= 2;
5088         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
5089     }
5090     else {
5091         aSig >>= 2;
5092         bSig >>= 2;
5093     }
5094     do {
5095         alternateASig = aSig;
5096         ++q;
5097         aSig -= bSig;
5098     } while ( 0 <= (int64_t) aSig );
5099     sigMean = aSig + alternateASig;
5100     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
5101         aSig = alternateASig;
5102     }
5103     zSign = ( (int64_t) aSig < 0 );
5104     if ( zSign ) aSig = - aSig;
5105     return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
5106 
5107 }
5108 
5109 /*----------------------------------------------------------------------------
5110 | Returns the binary log of the double-precision floating-point value `a'.
5111 | The operation is performed according to the IEC/IEEE Standard for Binary
5112 | Floating-Point Arithmetic.
5113 *----------------------------------------------------------------------------*/
5114 float64 float64_log2(float64 a, float_status *status)
5115 {
5116     flag aSign, zSign;
5117     int aExp;
5118     uint64_t aSig, aSig0, aSig1, zSig, i;
5119     a = float64_squash_input_denormal(a, status);
5120 
5121     aSig = extractFloat64Frac( a );
5122     aExp = extractFloat64Exp( a );
5123     aSign = extractFloat64Sign( a );
5124 
5125     if ( aExp == 0 ) {
5126         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
5127         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5128     }
5129     if ( aSign ) {
5130         float_raise(float_flag_invalid, status);
5131         return float64_default_nan(status);
5132     }
5133     if ( aExp == 0x7FF ) {
5134         if (aSig) {
5135             return propagateFloat64NaN(a, float64_zero, status);
5136         }
5137         return a;
5138     }
5139 
5140     aExp -= 0x3FF;
5141     aSig |= LIT64( 0x0010000000000000 );
5142     zSign = aExp < 0;
5143     zSig = (uint64_t)aExp << 52;
5144     for (i = 1LL << 51; i > 0; i >>= 1) {
5145         mul64To128( aSig, aSig, &aSig0, &aSig1 );
5146         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
5147         if ( aSig & LIT64( 0x0020000000000000 ) ) {
5148             aSig >>= 1;
5149             zSig |= i;
5150         }
5151     }
5152 
5153     if ( zSign )
5154         zSig = -zSig;
5155     return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
5156 }
5157 
5158 /*----------------------------------------------------------------------------
5159 | Returns 1 if the double-precision floating-point value `a' is equal to the
5160 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
5161 | if either operand is a NaN.  Otherwise, the comparison is performed
5162 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5163 *----------------------------------------------------------------------------*/
5164 
5165 int float64_eq(float64 a, float64 b, float_status *status)
5166 {
5167     uint64_t av, bv;
5168     a = float64_squash_input_denormal(a, status);
5169     b = float64_squash_input_denormal(b, status);
5170 
5171     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5172          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5173        ) {
5174         float_raise(float_flag_invalid, status);
5175         return 0;
5176     }
5177     av = float64_val(a);
5178     bv = float64_val(b);
5179     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
5180 
5181 }
5182 
5183 /*----------------------------------------------------------------------------
5184 | Returns 1 if the double-precision floating-point value `a' is less than or
5185 | equal to the corresponding value `b', and 0 otherwise.  The invalid
5186 | exception is raised if either operand is a NaN.  The comparison is performed
5187 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5188 *----------------------------------------------------------------------------*/
5189 
5190 int float64_le(float64 a, float64 b, float_status *status)
5191 {
5192     flag aSign, bSign;
5193     uint64_t av, bv;
5194     a = float64_squash_input_denormal(a, status);
5195     b = float64_squash_input_denormal(b, status);
5196 
5197     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5198          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5199        ) {
5200         float_raise(float_flag_invalid, status);
5201         return 0;
5202     }
5203     aSign = extractFloat64Sign( a );
5204     bSign = extractFloat64Sign( b );
5205     av = float64_val(a);
5206     bv = float64_val(b);
5207     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
5208     return ( av == bv ) || ( aSign ^ ( av < bv ) );
5209 
5210 }
5211 
5212 /*----------------------------------------------------------------------------
5213 | Returns 1 if the double-precision floating-point value `a' is less than
5214 | the corresponding value `b', and 0 otherwise.  The invalid exception is
5215 | raised if either operand is a NaN.  The comparison is performed according
5216 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5217 *----------------------------------------------------------------------------*/
5218 
5219 int float64_lt(float64 a, float64 b, float_status *status)
5220 {
5221     flag aSign, bSign;
5222     uint64_t av, bv;
5223 
5224     a = float64_squash_input_denormal(a, status);
5225     b = float64_squash_input_denormal(b, status);
5226     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5227          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5228        ) {
5229         float_raise(float_flag_invalid, status);
5230         return 0;
5231     }
5232     aSign = extractFloat64Sign( a );
5233     bSign = extractFloat64Sign( b );
5234     av = float64_val(a);
5235     bv = float64_val(b);
5236     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
5237     return ( av != bv ) && ( aSign ^ ( av < bv ) );
5238 
5239 }
5240 
5241 /*----------------------------------------------------------------------------
5242 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
5243 | be compared, and 0 otherwise.  The invalid exception is raised if either
5244 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
5245 | Standard for Binary Floating-Point Arithmetic.
5246 *----------------------------------------------------------------------------*/
5247 
5248 int float64_unordered(float64 a, float64 b, float_status *status)
5249 {
5250     a = float64_squash_input_denormal(a, status);
5251     b = float64_squash_input_denormal(b, status);
5252 
5253     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5254          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5255        ) {
5256         float_raise(float_flag_invalid, status);
5257         return 1;
5258     }
5259     return 0;
5260 }
5261 
5262 /*----------------------------------------------------------------------------
5263 | Returns 1 if the double-precision floating-point value `a' is equal to the
5264 | corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5265 | exception.The comparison is performed according to the IEC/IEEE Standard
5266 | for Binary Floating-Point Arithmetic.
5267 *----------------------------------------------------------------------------*/
5268 
5269 int float64_eq_quiet(float64 a, float64 b, float_status *status)
5270 {
5271     uint64_t av, bv;
5272     a = float64_squash_input_denormal(a, status);
5273     b = float64_squash_input_denormal(b, status);
5274 
5275     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5276          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5277        ) {
5278         if (float64_is_signaling_nan(a, status)
5279          || float64_is_signaling_nan(b, status)) {
5280             float_raise(float_flag_invalid, status);
5281         }
5282         return 0;
5283     }
5284     av = float64_val(a);
5285     bv = float64_val(b);
5286     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
5287 
5288 }
5289 
5290 /*----------------------------------------------------------------------------
5291 | Returns 1 if the double-precision floating-point value `a' is less than or
5292 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5293 | cause an exception.  Otherwise, the comparison is performed according to the
5294 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5295 *----------------------------------------------------------------------------*/
5296 
5297 int float64_le_quiet(float64 a, float64 b, float_status *status)
5298 {
5299     flag aSign, bSign;
5300     uint64_t av, bv;
5301     a = float64_squash_input_denormal(a, status);
5302     b = float64_squash_input_denormal(b, status);
5303 
5304     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5305          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5306        ) {
5307         if (float64_is_signaling_nan(a, status)
5308          || float64_is_signaling_nan(b, status)) {
5309             float_raise(float_flag_invalid, status);
5310         }
5311         return 0;
5312     }
5313     aSign = extractFloat64Sign( a );
5314     bSign = extractFloat64Sign( b );
5315     av = float64_val(a);
5316     bv = float64_val(b);
5317     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
5318     return ( av == bv ) || ( aSign ^ ( av < bv ) );
5319 
5320 }
5321 
5322 /*----------------------------------------------------------------------------
5323 | Returns 1 if the double-precision floating-point value `a' is less than
5324 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5325 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
5326 | Standard for Binary Floating-Point Arithmetic.
5327 *----------------------------------------------------------------------------*/
5328 
5329 int float64_lt_quiet(float64 a, float64 b, float_status *status)
5330 {
5331     flag aSign, bSign;
5332     uint64_t av, bv;
5333     a = float64_squash_input_denormal(a, status);
5334     b = float64_squash_input_denormal(b, status);
5335 
5336     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5337          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5338        ) {
5339         if (float64_is_signaling_nan(a, status)
5340          || float64_is_signaling_nan(b, status)) {
5341             float_raise(float_flag_invalid, status);
5342         }
5343         return 0;
5344     }
5345     aSign = extractFloat64Sign( a );
5346     bSign = extractFloat64Sign( b );
5347     av = float64_val(a);
5348     bv = float64_val(b);
5349     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
5350     return ( av != bv ) && ( aSign ^ ( av < bv ) );
5351 
5352 }
5353 
5354 /*----------------------------------------------------------------------------
5355 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
5356 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
5357 | comparison is performed according to the IEC/IEEE Standard for Binary
5358 | Floating-Point Arithmetic.
5359 *----------------------------------------------------------------------------*/
5360 
5361 int float64_unordered_quiet(float64 a, float64 b, float_status *status)
5362 {
5363     a = float64_squash_input_denormal(a, status);
5364     b = float64_squash_input_denormal(b, status);
5365 
5366     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5367          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5368        ) {
5369         if (float64_is_signaling_nan(a, status)
5370          || float64_is_signaling_nan(b, status)) {
5371             float_raise(float_flag_invalid, status);
5372         }
5373         return 1;
5374     }
5375     return 0;
5376 }
5377 
5378 /*----------------------------------------------------------------------------
5379 | Returns the result of converting the extended double-precision floating-
5380 | point value `a' to the 32-bit two's complement integer format.  The
5381 | conversion is performed according to the IEC/IEEE Standard for Binary
5382 | Floating-Point Arithmetic---which means in particular that the conversion
5383 | is rounded according to the current rounding mode.  If `a' is a NaN, the
5384 | largest positive integer is returned.  Otherwise, if the conversion
5385 | overflows, the largest integer with the same sign as `a' is returned.
5386 *----------------------------------------------------------------------------*/
5387 
5388 int32_t floatx80_to_int32(floatx80 a, float_status *status)
5389 {
5390     flag aSign;
5391     int32_t aExp, shiftCount;
5392     uint64_t aSig;
5393 
5394     if (floatx80_invalid_encoding(a)) {
5395         float_raise(float_flag_invalid, status);
5396         return 1 << 31;
5397     }
5398     aSig = extractFloatx80Frac( a );
5399     aExp = extractFloatx80Exp( a );
5400     aSign = extractFloatx80Sign( a );
5401     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
5402     shiftCount = 0x4037 - aExp;
5403     if ( shiftCount <= 0 ) shiftCount = 1;
5404     shift64RightJamming( aSig, shiftCount, &aSig );
5405     return roundAndPackInt32(aSign, aSig, status);
5406 
5407 }
5408 
5409 /*----------------------------------------------------------------------------
5410 | Returns the result of converting the extended double-precision floating-
5411 | point value `a' to the 32-bit two's complement integer format.  The
5412 | conversion is performed according to the IEC/IEEE Standard for Binary
5413 | Floating-Point Arithmetic, except that the conversion is always rounded
5414 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
5415 | Otherwise, if the conversion overflows, the largest integer with the same
5416 | sign as `a' is returned.
5417 *----------------------------------------------------------------------------*/
5418 
5419 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
5420 {
5421     flag aSign;
5422     int32_t aExp, shiftCount;
5423     uint64_t aSig, savedASig;
5424     int32_t z;
5425 
5426     if (floatx80_invalid_encoding(a)) {
5427         float_raise(float_flag_invalid, status);
5428         return 1 << 31;
5429     }
5430     aSig = extractFloatx80Frac( a );
5431     aExp = extractFloatx80Exp( a );
5432     aSign = extractFloatx80Sign( a );
5433     if ( 0x401E < aExp ) {
5434         if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
5435         goto invalid;
5436     }
5437     else if ( aExp < 0x3FFF ) {
5438         if (aExp || aSig) {
5439             status->float_exception_flags |= float_flag_inexact;
5440         }
5441         return 0;
5442     }
5443     shiftCount = 0x403E - aExp;
5444     savedASig = aSig;
5445     aSig >>= shiftCount;
5446     z = aSig;
5447     if ( aSign ) z = - z;
5448     if ( ( z < 0 ) ^ aSign ) {
5449  invalid:
5450         float_raise(float_flag_invalid, status);
5451         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5452     }
5453     if ( ( aSig<<shiftCount ) != savedASig ) {
5454         status->float_exception_flags |= float_flag_inexact;
5455     }
5456     return z;
5457 
5458 }
5459 
5460 /*----------------------------------------------------------------------------
5461 | Returns the result of converting the extended double-precision floating-
5462 | point value `a' to the 64-bit two's complement integer format.  The
5463 | conversion is performed according to the IEC/IEEE Standard for Binary
5464 | Floating-Point Arithmetic---which means in particular that the conversion
5465 | is rounded according to the current rounding mode.  If `a' is a NaN,
5466 | the largest positive integer is returned.  Otherwise, if the conversion
5467 | overflows, the largest integer with the same sign as `a' is returned.
5468 *----------------------------------------------------------------------------*/
5469 
5470 int64_t floatx80_to_int64(floatx80 a, float_status *status)
5471 {
5472     flag aSign;
5473     int32_t aExp, shiftCount;
5474     uint64_t aSig, aSigExtra;
5475 
5476     if (floatx80_invalid_encoding(a)) {
5477         float_raise(float_flag_invalid, status);
5478         return 1ULL << 63;
5479     }
5480     aSig = extractFloatx80Frac( a );
5481     aExp = extractFloatx80Exp( a );
5482     aSign = extractFloatx80Sign( a );
5483     shiftCount = 0x403E - aExp;
5484     if ( shiftCount <= 0 ) {
5485         if ( shiftCount ) {
5486             float_raise(float_flag_invalid, status);
5487             if (!aSign || floatx80_is_any_nan(a)) {
5488                 return LIT64( 0x7FFFFFFFFFFFFFFF );
5489             }
5490             return (int64_t) LIT64( 0x8000000000000000 );
5491         }
5492         aSigExtra = 0;
5493     }
5494     else {
5495         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
5496     }
5497     return roundAndPackInt64(aSign, aSig, aSigExtra, status);
5498 
5499 }
5500 
5501 /*----------------------------------------------------------------------------
5502 | Returns the result of converting the extended double-precision floating-
5503 | point value `a' to the 64-bit two's complement integer format.  The
5504 | conversion is performed according to the IEC/IEEE Standard for Binary
5505 | Floating-Point Arithmetic, except that the conversion is always rounded
5506 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
5507 | Otherwise, if the conversion overflows, the largest integer with the same
5508 | sign as `a' is returned.
5509 *----------------------------------------------------------------------------*/
5510 
5511 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
5512 {
5513     flag aSign;
5514     int32_t aExp, shiftCount;
5515     uint64_t aSig;
5516     int64_t z;
5517 
5518     if (floatx80_invalid_encoding(a)) {
5519         float_raise(float_flag_invalid, status);
5520         return 1ULL << 63;
5521     }
5522     aSig = extractFloatx80Frac( a );
5523     aExp = extractFloatx80Exp( a );
5524     aSign = extractFloatx80Sign( a );
5525     shiftCount = aExp - 0x403E;
5526     if ( 0 <= shiftCount ) {
5527         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
5528         if ( ( a.high != 0xC03E ) || aSig ) {
5529             float_raise(float_flag_invalid, status);
5530             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
5531                 return LIT64( 0x7FFFFFFFFFFFFFFF );
5532             }
5533         }
5534         return (int64_t) LIT64( 0x8000000000000000 );
5535     }
5536     else if ( aExp < 0x3FFF ) {
5537         if (aExp | aSig) {
5538             status->float_exception_flags |= float_flag_inexact;
5539         }
5540         return 0;
5541     }
5542     z = aSig>>( - shiftCount );
5543     if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
5544         status->float_exception_flags |= float_flag_inexact;
5545     }
5546     if ( aSign ) z = - z;
5547     return z;
5548 
5549 }
5550 
5551 /*----------------------------------------------------------------------------
5552 | Returns the result of converting the extended double-precision floating-
5553 | point value `a' to the single-precision floating-point format.  The
5554 | conversion is performed according to the IEC/IEEE Standard for Binary
5555 | Floating-Point Arithmetic.
5556 *----------------------------------------------------------------------------*/
5557 
5558 float32 floatx80_to_float32(floatx80 a, float_status *status)
5559 {
5560     flag aSign;
5561     int32_t aExp;
5562     uint64_t aSig;
5563 
5564     if (floatx80_invalid_encoding(a)) {
5565         float_raise(float_flag_invalid, status);
5566         return float32_default_nan(status);
5567     }
5568     aSig = extractFloatx80Frac( a );
5569     aExp = extractFloatx80Exp( a );
5570     aSign = extractFloatx80Sign( a );
5571     if ( aExp == 0x7FFF ) {
5572         if ( (uint64_t) ( aSig<<1 ) ) {
5573             return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
5574         }
5575         return packFloat32( aSign, 0xFF, 0 );
5576     }
5577     shift64RightJamming( aSig, 33, &aSig );
5578     if ( aExp || aSig ) aExp -= 0x3F81;
5579     return roundAndPackFloat32(aSign, aExp, aSig, status);
5580 
5581 }
5582 
5583 /*----------------------------------------------------------------------------
5584 | Returns the result of converting the extended double-precision floating-
5585 | point value `a' to the double-precision floating-point format.  The
5586 | conversion is performed according to the IEC/IEEE Standard for Binary
5587 | Floating-Point Arithmetic.
5588 *----------------------------------------------------------------------------*/
5589 
5590 float64 floatx80_to_float64(floatx80 a, float_status *status)
5591 {
5592     flag aSign;
5593     int32_t aExp;
5594     uint64_t aSig, zSig;
5595 
5596     if (floatx80_invalid_encoding(a)) {
5597         float_raise(float_flag_invalid, status);
5598         return float64_default_nan(status);
5599     }
5600     aSig = extractFloatx80Frac( a );
5601     aExp = extractFloatx80Exp( a );
5602     aSign = extractFloatx80Sign( a );
5603     if ( aExp == 0x7FFF ) {
5604         if ( (uint64_t) ( aSig<<1 ) ) {
5605             return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
5606         }
5607         return packFloat64( aSign, 0x7FF, 0 );
5608     }
5609     shift64RightJamming( aSig, 1, &zSig );
5610     if ( aExp || aSig ) aExp -= 0x3C01;
5611     return roundAndPackFloat64(aSign, aExp, zSig, status);
5612 
5613 }
5614 
5615 /*----------------------------------------------------------------------------
5616 | Returns the result of converting the extended double-precision floating-
5617 | point value `a' to the quadruple-precision floating-point format.  The
5618 | conversion is performed according to the IEC/IEEE Standard for Binary
5619 | Floating-Point Arithmetic.
5620 *----------------------------------------------------------------------------*/
5621 
5622 float128 floatx80_to_float128(floatx80 a, float_status *status)
5623 {
5624     flag aSign;
5625     int aExp;
5626     uint64_t aSig, zSig0, zSig1;
5627 
5628     if (floatx80_invalid_encoding(a)) {
5629         float_raise(float_flag_invalid, status);
5630         return float128_default_nan(status);
5631     }
5632     aSig = extractFloatx80Frac( a );
5633     aExp = extractFloatx80Exp( a );
5634     aSign = extractFloatx80Sign( a );
5635     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
5636         return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
5637     }
5638     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
5639     return packFloat128( aSign, aExp, zSig0, zSig1 );
5640 
5641 }
5642 
5643 /*----------------------------------------------------------------------------
5644 | Rounds the extended double-precision floating-point value `a'
5645 | to the precision provided by floatx80_rounding_precision and returns the
5646 | result as an extended double-precision floating-point value.
5647 | The operation is performed according to the IEC/IEEE Standard for Binary
5648 | Floating-Point Arithmetic.
5649 *----------------------------------------------------------------------------*/
5650 
5651 floatx80 floatx80_round(floatx80 a, float_status *status)
5652 {
5653     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5654                                 extractFloatx80Sign(a),
5655                                 extractFloatx80Exp(a),
5656                                 extractFloatx80Frac(a), 0, status);
5657 }
5658 
5659 /*----------------------------------------------------------------------------
5660 | Rounds the extended double-precision floating-point value `a' to an integer,
5661 | and returns the result as an extended quadruple-precision floating-point
5662 | value.  The operation is performed according to the IEC/IEEE Standard for
5663 | Binary Floating-Point Arithmetic.
5664 *----------------------------------------------------------------------------*/
5665 
5666 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
5667 {
5668     flag aSign;
5669     int32_t aExp;
5670     uint64_t lastBitMask, roundBitsMask;
5671     floatx80 z;
5672 
5673     if (floatx80_invalid_encoding(a)) {
5674         float_raise(float_flag_invalid, status);
5675         return floatx80_default_nan(status);
5676     }
5677     aExp = extractFloatx80Exp( a );
5678     if ( 0x403E <= aExp ) {
5679         if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
5680             return propagateFloatx80NaN(a, a, status);
5681         }
5682         return a;
5683     }
5684     if ( aExp < 0x3FFF ) {
5685         if (    ( aExp == 0 )
5686              && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
5687             return a;
5688         }
5689         status->float_exception_flags |= float_flag_inexact;
5690         aSign = extractFloatx80Sign( a );
5691         switch (status->float_rounding_mode) {
5692          case float_round_nearest_even:
5693             if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
5694                ) {
5695                 return
5696                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
5697             }
5698             break;
5699         case float_round_ties_away:
5700             if (aExp == 0x3FFE) {
5701                 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
5702             }
5703             break;
5704          case float_round_down:
5705             return
5706                   aSign ?
5707                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
5708                 : packFloatx80( 0, 0, 0 );
5709          case float_round_up:
5710             return
5711                   aSign ? packFloatx80( 1, 0, 0 )
5712                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
5713         }
5714         return packFloatx80( aSign, 0, 0 );
5715     }
5716     lastBitMask = 1;
5717     lastBitMask <<= 0x403E - aExp;
5718     roundBitsMask = lastBitMask - 1;
5719     z = a;
5720     switch (status->float_rounding_mode) {
5721     case float_round_nearest_even:
5722         z.low += lastBitMask>>1;
5723         if ((z.low & roundBitsMask) == 0) {
5724             z.low &= ~lastBitMask;
5725         }
5726         break;
5727     case float_round_ties_away:
5728         z.low += lastBitMask >> 1;
5729         break;
5730     case float_round_to_zero:
5731         break;
5732     case float_round_up:
5733         if (!extractFloatx80Sign(z)) {
5734             z.low += roundBitsMask;
5735         }
5736         break;
5737     case float_round_down:
5738         if (extractFloatx80Sign(z)) {
5739             z.low += roundBitsMask;
5740         }
5741         break;
5742     default:
5743         abort();
5744     }
5745     z.low &= ~ roundBitsMask;
5746     if ( z.low == 0 ) {
5747         ++z.high;
5748         z.low = LIT64( 0x8000000000000000 );
5749     }
5750     if (z.low != a.low) {
5751         status->float_exception_flags |= float_flag_inexact;
5752     }
5753     return z;
5754 
5755 }
5756 
5757 /*----------------------------------------------------------------------------
5758 | Returns the result of adding the absolute values of the extended double-
5759 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
5760 | negated before being returned.  `zSign' is ignored if the result is a NaN.
5761 | The addition is performed according to the IEC/IEEE Standard for Binary
5762 | Floating-Point Arithmetic.
5763 *----------------------------------------------------------------------------*/
5764 
5765 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5766                                 float_status *status)
5767 {
5768     int32_t aExp, bExp, zExp;
5769     uint64_t aSig, bSig, zSig0, zSig1;
5770     int32_t expDiff;
5771 
5772     aSig = extractFloatx80Frac( a );
5773     aExp = extractFloatx80Exp( a );
5774     bSig = extractFloatx80Frac( b );
5775     bExp = extractFloatx80Exp( b );
5776     expDiff = aExp - bExp;
5777     if ( 0 < expDiff ) {
5778         if ( aExp == 0x7FFF ) {
5779             if ((uint64_t)(aSig << 1)) {
5780                 return propagateFloatx80NaN(a, b, status);
5781             }
5782             return a;
5783         }
5784         if ( bExp == 0 ) --expDiff;
5785         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5786         zExp = aExp;
5787     }
5788     else if ( expDiff < 0 ) {
5789         if ( bExp == 0x7FFF ) {
5790             if ((uint64_t)(bSig << 1)) {
5791                 return propagateFloatx80NaN(a, b, status);
5792             }
5793             return packFloatx80(zSign,
5794                                 floatx80_infinity_high,
5795                                 floatx80_infinity_low);
5796         }
5797         if ( aExp == 0 ) ++expDiff;
5798         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5799         zExp = bExp;
5800     }
5801     else {
5802         if ( aExp == 0x7FFF ) {
5803             if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5804                 return propagateFloatx80NaN(a, b, status);
5805             }
5806             return a;
5807         }
5808         zSig1 = 0;
5809         zSig0 = aSig + bSig;
5810         if ( aExp == 0 ) {
5811             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
5812             goto roundAndPack;
5813         }
5814         zExp = aExp;
5815         goto shiftRight1;
5816     }
5817     zSig0 = aSig + bSig;
5818     if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
5819  shiftRight1:
5820     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
5821     zSig0 |= LIT64( 0x8000000000000000 );
5822     ++zExp;
5823  roundAndPack:
5824     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5825                                 zSign, zExp, zSig0, zSig1, status);
5826 }
5827 
5828 /*----------------------------------------------------------------------------
5829 | Returns the result of subtracting the absolute values of the extended
5830 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
5831 | difference is negated before being returned.  `zSign' is ignored if the
5832 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
5833 | Standard for Binary Floating-Point Arithmetic.
5834 *----------------------------------------------------------------------------*/
5835 
5836 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5837                                 float_status *status)
5838 {
5839     int32_t aExp, bExp, zExp;
5840     uint64_t aSig, bSig, zSig0, zSig1;
5841     int32_t expDiff;
5842 
5843     aSig = extractFloatx80Frac( a );
5844     aExp = extractFloatx80Exp( a );
5845     bSig = extractFloatx80Frac( b );
5846     bExp = extractFloatx80Exp( b );
5847     expDiff = aExp - bExp;
5848     if ( 0 < expDiff ) goto aExpBigger;
5849     if ( expDiff < 0 ) goto bExpBigger;
5850     if ( aExp == 0x7FFF ) {
5851         if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5852             return propagateFloatx80NaN(a, b, status);
5853         }
5854         float_raise(float_flag_invalid, status);
5855         return floatx80_default_nan(status);
5856     }
5857     if ( aExp == 0 ) {
5858         aExp = 1;
5859         bExp = 1;
5860     }
5861     zSig1 = 0;
5862     if ( bSig < aSig ) goto aBigger;
5863     if ( aSig < bSig ) goto bBigger;
5864     return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
5865  bExpBigger:
5866     if ( bExp == 0x7FFF ) {
5867         if ((uint64_t)(bSig << 1)) {
5868             return propagateFloatx80NaN(a, b, status);
5869         }
5870         return packFloatx80(zSign ^ 1, floatx80_infinity_high,
5871                             floatx80_infinity_low);
5872     }
5873     if ( aExp == 0 ) ++expDiff;
5874     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5875  bBigger:
5876     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
5877     zExp = bExp;
5878     zSign ^= 1;
5879     goto normalizeRoundAndPack;
5880  aExpBigger:
5881     if ( aExp == 0x7FFF ) {
5882         if ((uint64_t)(aSig << 1)) {
5883             return propagateFloatx80NaN(a, b, status);
5884         }
5885         return a;
5886     }
5887     if ( bExp == 0 ) --expDiff;
5888     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5889  aBigger:
5890     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
5891     zExp = aExp;
5892  normalizeRoundAndPack:
5893     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
5894                                          zSign, zExp, zSig0, zSig1, status);
5895 }
5896 
5897 /*----------------------------------------------------------------------------
5898 | Returns the result of adding the extended double-precision floating-point
5899 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
5900 | Standard for Binary Floating-Point Arithmetic.
5901 *----------------------------------------------------------------------------*/
5902 
5903 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
5904 {
5905     flag aSign, bSign;
5906 
5907     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5908         float_raise(float_flag_invalid, status);
5909         return floatx80_default_nan(status);
5910     }
5911     aSign = extractFloatx80Sign( a );
5912     bSign = extractFloatx80Sign( b );
5913     if ( aSign == bSign ) {
5914         return addFloatx80Sigs(a, b, aSign, status);
5915     }
5916     else {
5917         return subFloatx80Sigs(a, b, aSign, status);
5918     }
5919 
5920 }
5921 
5922 /*----------------------------------------------------------------------------
5923 | Returns the result of subtracting the extended double-precision floating-
5924 | point values `a' and `b'.  The operation is performed according to the
5925 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5926 *----------------------------------------------------------------------------*/
5927 
5928 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
5929 {
5930     flag aSign, bSign;
5931 
5932     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5933         float_raise(float_flag_invalid, status);
5934         return floatx80_default_nan(status);
5935     }
5936     aSign = extractFloatx80Sign( a );
5937     bSign = extractFloatx80Sign( b );
5938     if ( aSign == bSign ) {
5939         return subFloatx80Sigs(a, b, aSign, status);
5940     }
5941     else {
5942         return addFloatx80Sigs(a, b, aSign, status);
5943     }
5944 
5945 }
5946 
5947 /*----------------------------------------------------------------------------
5948 | Returns the result of multiplying the extended double-precision floating-
5949 | point values `a' and `b'.  The operation is performed according to the
5950 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5951 *----------------------------------------------------------------------------*/
5952 
5953 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
5954 {
5955     flag aSign, bSign, zSign;
5956     int32_t aExp, bExp, zExp;
5957     uint64_t aSig, bSig, zSig0, zSig1;
5958 
5959     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5960         float_raise(float_flag_invalid, status);
5961         return floatx80_default_nan(status);
5962     }
5963     aSig = extractFloatx80Frac( a );
5964     aExp = extractFloatx80Exp( a );
5965     aSign = extractFloatx80Sign( a );
5966     bSig = extractFloatx80Frac( b );
5967     bExp = extractFloatx80Exp( b );
5968     bSign = extractFloatx80Sign( b );
5969     zSign = aSign ^ bSign;
5970     if ( aExp == 0x7FFF ) {
5971         if (    (uint64_t) ( aSig<<1 )
5972              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5973             return propagateFloatx80NaN(a, b, status);
5974         }
5975         if ( ( bExp | bSig ) == 0 ) goto invalid;
5976         return packFloatx80(zSign, floatx80_infinity_high,
5977                                    floatx80_infinity_low);
5978     }
5979     if ( bExp == 0x7FFF ) {
5980         if ((uint64_t)(bSig << 1)) {
5981             return propagateFloatx80NaN(a, b, status);
5982         }
5983         if ( ( aExp | aSig ) == 0 ) {
5984  invalid:
5985             float_raise(float_flag_invalid, status);
5986             return floatx80_default_nan(status);
5987         }
5988         return packFloatx80(zSign, floatx80_infinity_high,
5989                                    floatx80_infinity_low);
5990     }
5991     if ( aExp == 0 ) {
5992         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5993         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5994     }
5995     if ( bExp == 0 ) {
5996         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5997         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5998     }
5999     zExp = aExp + bExp - 0x3FFE;
6000     mul64To128( aSig, bSig, &zSig0, &zSig1 );
6001     if ( 0 < (int64_t) zSig0 ) {
6002         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
6003         --zExp;
6004     }
6005     return roundAndPackFloatx80(status->floatx80_rounding_precision,
6006                                 zSign, zExp, zSig0, zSig1, status);
6007 }
6008 
6009 /*----------------------------------------------------------------------------
6010 | Returns the result of dividing the extended double-precision floating-point
6011 | value `a' by the corresponding value `b'.  The operation is performed
6012 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6013 *----------------------------------------------------------------------------*/
6014 
6015 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
6016 {
6017     flag aSign, bSign, zSign;
6018     int32_t aExp, bExp, zExp;
6019     uint64_t aSig, bSig, zSig0, zSig1;
6020     uint64_t rem0, rem1, rem2, term0, term1, term2;
6021 
6022     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6023         float_raise(float_flag_invalid, status);
6024         return floatx80_default_nan(status);
6025     }
6026     aSig = extractFloatx80Frac( a );
6027     aExp = extractFloatx80Exp( a );
6028     aSign = extractFloatx80Sign( a );
6029     bSig = extractFloatx80Frac( b );
6030     bExp = extractFloatx80Exp( b );
6031     bSign = extractFloatx80Sign( b );
6032     zSign = aSign ^ bSign;
6033     if ( aExp == 0x7FFF ) {
6034         if ((uint64_t)(aSig << 1)) {
6035             return propagateFloatx80NaN(a, b, status);
6036         }
6037         if ( bExp == 0x7FFF ) {
6038             if ((uint64_t)(bSig << 1)) {
6039                 return propagateFloatx80NaN(a, b, status);
6040             }
6041             goto invalid;
6042         }
6043         return packFloatx80(zSign, floatx80_infinity_high,
6044                                    floatx80_infinity_low);
6045     }
6046     if ( bExp == 0x7FFF ) {
6047         if ((uint64_t)(bSig << 1)) {
6048             return propagateFloatx80NaN(a, b, status);
6049         }
6050         return packFloatx80( zSign, 0, 0 );
6051     }
6052     if ( bExp == 0 ) {
6053         if ( bSig == 0 ) {
6054             if ( ( aExp | aSig ) == 0 ) {
6055  invalid:
6056                 float_raise(float_flag_invalid, status);
6057                 return floatx80_default_nan(status);
6058             }
6059             float_raise(float_flag_divbyzero, status);
6060             return packFloatx80(zSign, floatx80_infinity_high,
6061                                        floatx80_infinity_low);
6062         }
6063         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
6064     }
6065     if ( aExp == 0 ) {
6066         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
6067         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
6068     }
6069     zExp = aExp - bExp + 0x3FFE;
6070     rem1 = 0;
6071     if ( bSig <= aSig ) {
6072         shift128Right( aSig, 0, 1, &aSig, &rem1 );
6073         ++zExp;
6074     }
6075     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
6076     mul64To128( bSig, zSig0, &term0, &term1 );
6077     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
6078     while ( (int64_t) rem0 < 0 ) {
6079         --zSig0;
6080         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
6081     }
6082     zSig1 = estimateDiv128To64( rem1, 0, bSig );
6083     if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
6084         mul64To128( bSig, zSig1, &term1, &term2 );
6085         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6086         while ( (int64_t) rem1 < 0 ) {
6087             --zSig1;
6088             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
6089         }
6090         zSig1 |= ( ( rem1 | rem2 ) != 0 );
6091     }
6092     return roundAndPackFloatx80(status->floatx80_rounding_precision,
6093                                 zSign, zExp, zSig0, zSig1, status);
6094 }
6095 
6096 /*----------------------------------------------------------------------------
6097 | Returns the remainder of the extended double-precision floating-point value
6098 | `a' with respect to the corresponding value `b'.  The operation is performed
6099 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6100 *----------------------------------------------------------------------------*/
6101 
6102 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
6103 {
6104     flag aSign, zSign;
6105     int32_t aExp, bExp, expDiff;
6106     uint64_t aSig0, aSig1, bSig;
6107     uint64_t q, term0, term1, alternateASig0, alternateASig1;
6108 
6109     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6110         float_raise(float_flag_invalid, status);
6111         return floatx80_default_nan(status);
6112     }
6113     aSig0 = extractFloatx80Frac( a );
6114     aExp = extractFloatx80Exp( a );
6115     aSign = extractFloatx80Sign( a );
6116     bSig = extractFloatx80Frac( b );
6117     bExp = extractFloatx80Exp( b );
6118     if ( aExp == 0x7FFF ) {
6119         if (    (uint64_t) ( aSig0<<1 )
6120              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
6121             return propagateFloatx80NaN(a, b, status);
6122         }
6123         goto invalid;
6124     }
6125     if ( bExp == 0x7FFF ) {
6126         if ((uint64_t)(bSig << 1)) {
6127             return propagateFloatx80NaN(a, b, status);
6128         }
6129         return a;
6130     }
6131     if ( bExp == 0 ) {
6132         if ( bSig == 0 ) {
6133  invalid:
6134             float_raise(float_flag_invalid, status);
6135             return floatx80_default_nan(status);
6136         }
6137         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
6138     }
6139     if ( aExp == 0 ) {
6140         if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
6141         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
6142     }
6143     bSig |= LIT64( 0x8000000000000000 );
6144     zSign = aSign;
6145     expDiff = aExp - bExp;
6146     aSig1 = 0;
6147     if ( expDiff < 0 ) {
6148         if ( expDiff < -1 ) return a;
6149         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
6150         expDiff = 0;
6151     }
6152     q = ( bSig <= aSig0 );
6153     if ( q ) aSig0 -= bSig;
6154     expDiff -= 64;
6155     while ( 0 < expDiff ) {
6156         q = estimateDiv128To64( aSig0, aSig1, bSig );
6157         q = ( 2 < q ) ? q - 2 : 0;
6158         mul64To128( bSig, q, &term0, &term1 );
6159         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6160         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
6161         expDiff -= 62;
6162     }
6163     expDiff += 64;
6164     if ( 0 < expDiff ) {
6165         q = estimateDiv128To64( aSig0, aSig1, bSig );
6166         q = ( 2 < q ) ? q - 2 : 0;
6167         q >>= 64 - expDiff;
6168         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
6169         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6170         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
6171         while ( le128( term0, term1, aSig0, aSig1 ) ) {
6172             ++q;
6173             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6174         }
6175     }
6176     else {
6177         term1 = 0;
6178         term0 = bSig;
6179     }
6180     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
6181     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
6182          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
6183               && ( q & 1 ) )
6184        ) {
6185         aSig0 = alternateASig0;
6186         aSig1 = alternateASig1;
6187         zSign = ! zSign;
6188     }
6189     return
6190         normalizeRoundAndPackFloatx80(
6191             80, zSign, bExp + expDiff, aSig0, aSig1, status);
6192 
6193 }
6194 
6195 /*----------------------------------------------------------------------------
6196 | Returns the square root of the extended double-precision floating-point
6197 | value `a'.  The operation is performed according to the IEC/IEEE Standard
6198 | for Binary Floating-Point Arithmetic.
6199 *----------------------------------------------------------------------------*/
6200 
6201 floatx80 floatx80_sqrt(floatx80 a, float_status *status)
6202 {
6203     flag aSign;
6204     int32_t aExp, zExp;
6205     uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
6206     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6207 
6208     if (floatx80_invalid_encoding(a)) {
6209         float_raise(float_flag_invalid, status);
6210         return floatx80_default_nan(status);
6211     }
6212     aSig0 = extractFloatx80Frac( a );
6213     aExp = extractFloatx80Exp( a );
6214     aSign = extractFloatx80Sign( a );
6215     if ( aExp == 0x7FFF ) {
6216         if ((uint64_t)(aSig0 << 1)) {
6217             return propagateFloatx80NaN(a, a, status);
6218         }
6219         if ( ! aSign ) return a;
6220         goto invalid;
6221     }
6222     if ( aSign ) {
6223         if ( ( aExp | aSig0 ) == 0 ) return a;
6224  invalid:
6225         float_raise(float_flag_invalid, status);
6226         return floatx80_default_nan(status);
6227     }
6228     if ( aExp == 0 ) {
6229         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
6230         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
6231     }
6232     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
6233     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
6234     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
6235     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6236     doubleZSig0 = zSig0<<1;
6237     mul64To128( zSig0, zSig0, &term0, &term1 );
6238     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6239     while ( (int64_t) rem0 < 0 ) {
6240         --zSig0;
6241         doubleZSig0 -= 2;
6242         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6243     }
6244     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6245     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
6246         if ( zSig1 == 0 ) zSig1 = 1;
6247         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6248         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6249         mul64To128( zSig1, zSig1, &term2, &term3 );
6250         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6251         while ( (int64_t) rem1 < 0 ) {
6252             --zSig1;
6253             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6254             term3 |= 1;
6255             term2 |= doubleZSig0;
6256             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6257         }
6258         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6259     }
6260     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
6261     zSig0 |= doubleZSig0;
6262     return roundAndPackFloatx80(status->floatx80_rounding_precision,
6263                                 0, zExp, zSig0, zSig1, status);
6264 }
6265 
6266 /*----------------------------------------------------------------------------
6267 | Returns 1 if the extended double-precision floating-point value `a' is equal
6268 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
6269 | raised if either operand is a NaN.  Otherwise, the comparison is performed
6270 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6271 *----------------------------------------------------------------------------*/
6272 
6273 int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
6274 {
6275 
6276     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
6277         || (extractFloatx80Exp(a) == 0x7FFF
6278             && (uint64_t) (extractFloatx80Frac(a) << 1))
6279         || (extractFloatx80Exp(b) == 0x7FFF
6280             && (uint64_t) (extractFloatx80Frac(b) << 1))
6281        ) {
6282         float_raise(float_flag_invalid, status);
6283         return 0;
6284     }
6285     return
6286            ( a.low == b.low )
6287         && (    ( a.high == b.high )
6288              || (    ( a.low == 0 )
6289                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6290            );
6291 
6292 }
6293 
6294 /*----------------------------------------------------------------------------
6295 | Returns 1 if the extended double-precision floating-point value `a' is
6296 | less than or equal to the corresponding value `b', and 0 otherwise.  The
6297 | invalid exception is raised if either operand is a NaN.  The comparison is
6298 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6299 | Arithmetic.
6300 *----------------------------------------------------------------------------*/
6301 
6302 int floatx80_le(floatx80 a, floatx80 b, float_status *status)
6303 {
6304     flag aSign, bSign;
6305 
6306     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
6307         || (extractFloatx80Exp(a) == 0x7FFF
6308             && (uint64_t) (extractFloatx80Frac(a) << 1))
6309         || (extractFloatx80Exp(b) == 0x7FFF
6310             && (uint64_t) (extractFloatx80Frac(b) << 1))
6311        ) {
6312         float_raise(float_flag_invalid, status);
6313         return 0;
6314     }
6315     aSign = extractFloatx80Sign( a );
6316     bSign = extractFloatx80Sign( b );
6317     if ( aSign != bSign ) {
6318         return
6319                aSign
6320             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6321                  == 0 );
6322     }
6323     return
6324           aSign ? le128( b.high, b.low, a.high, a.low )
6325         : le128( a.high, a.low, b.high, b.low );
6326 
6327 }
6328 
6329 /*----------------------------------------------------------------------------
6330 | Returns 1 if the extended double-precision floating-point value `a' is
6331 | less than the corresponding value `b', and 0 otherwise.  The invalid
6332 | exception is raised if either operand is a NaN.  The comparison is performed
6333 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6334 *----------------------------------------------------------------------------*/
6335 
6336 int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
6337 {
6338     flag aSign, bSign;
6339 
6340     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
6341         || (extractFloatx80Exp(a) == 0x7FFF
6342             && (uint64_t) (extractFloatx80Frac(a) << 1))
6343         || (extractFloatx80Exp(b) == 0x7FFF
6344             && (uint64_t) (extractFloatx80Frac(b) << 1))
6345        ) {
6346         float_raise(float_flag_invalid, status);
6347         return 0;
6348     }
6349     aSign = extractFloatx80Sign( a );
6350     bSign = extractFloatx80Sign( b );
6351     if ( aSign != bSign ) {
6352         return
6353                aSign
6354             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6355                  != 0 );
6356     }
6357     return
6358           aSign ? lt128( b.high, b.low, a.high, a.low )
6359         : lt128( a.high, a.low, b.high, b.low );
6360 
6361 }
6362 
6363 /*----------------------------------------------------------------------------
6364 | Returns 1 if the extended double-precision floating-point values `a' and `b'
6365 | cannot be compared, and 0 otherwise.  The invalid exception is raised if
6366 | either operand is a NaN.   The comparison is performed according to the
6367 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6368 *----------------------------------------------------------------------------*/
6369 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
6370 {
6371     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
6372         || (extractFloatx80Exp(a) == 0x7FFF
6373             && (uint64_t) (extractFloatx80Frac(a) << 1))
6374         || (extractFloatx80Exp(b) == 0x7FFF
6375             && (uint64_t) (extractFloatx80Frac(b) << 1))
6376        ) {
6377         float_raise(float_flag_invalid, status);
6378         return 1;
6379     }
6380     return 0;
6381 }
6382 
6383 /*----------------------------------------------------------------------------
6384 | Returns 1 if the extended double-precision floating-point value `a' is
6385 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
6386 | cause an exception.  The comparison is performed according to the IEC/IEEE
6387 | Standard for Binary Floating-Point Arithmetic.
6388 *----------------------------------------------------------------------------*/
6389 
6390 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
6391 {
6392 
6393     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6394         float_raise(float_flag_invalid, status);
6395         return 0;
6396     }
6397     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
6398               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
6399          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
6400               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
6401        ) {
6402         if (floatx80_is_signaling_nan(a, status)
6403          || floatx80_is_signaling_nan(b, status)) {
6404             float_raise(float_flag_invalid, status);
6405         }
6406         return 0;
6407     }
6408     return
6409            ( a.low == b.low )
6410         && (    ( a.high == b.high )
6411              || (    ( a.low == 0 )
6412                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6413            );
6414 
6415 }
6416 
6417 /*----------------------------------------------------------------------------
6418 | Returns 1 if the extended double-precision floating-point value `a' is less
6419 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
6420 | do not cause an exception.  Otherwise, the comparison is performed according
6421 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6422 *----------------------------------------------------------------------------*/
6423 
6424 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
6425 {
6426     flag aSign, bSign;
6427 
6428     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6429         float_raise(float_flag_invalid, status);
6430         return 0;
6431     }
6432     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
6433               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
6434          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
6435               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
6436        ) {
6437         if (floatx80_is_signaling_nan(a, status)
6438          || floatx80_is_signaling_nan(b, status)) {
6439             float_raise(float_flag_invalid, status);
6440         }
6441         return 0;
6442     }
6443     aSign = extractFloatx80Sign( a );
6444     bSign = extractFloatx80Sign( b );
6445     if ( aSign != bSign ) {
6446         return
6447                aSign
6448             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6449                  == 0 );
6450     }
6451     return
6452           aSign ? le128( b.high, b.low, a.high, a.low )
6453         : le128( a.high, a.low, b.high, b.low );
6454 
6455 }
6456 
6457 /*----------------------------------------------------------------------------
6458 | Returns 1 if the extended double-precision floating-point value `a' is less
6459 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
6460 | an exception.  Otherwise, the comparison is performed according to the
6461 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6462 *----------------------------------------------------------------------------*/
6463 
6464 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
6465 {
6466     flag aSign, bSign;
6467 
6468     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6469         float_raise(float_flag_invalid, status);
6470         return 0;
6471     }
6472     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
6473               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
6474          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
6475               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
6476        ) {
6477         if (floatx80_is_signaling_nan(a, status)
6478          || floatx80_is_signaling_nan(b, status)) {
6479             float_raise(float_flag_invalid, status);
6480         }
6481         return 0;
6482     }
6483     aSign = extractFloatx80Sign( a );
6484     bSign = extractFloatx80Sign( b );
6485     if ( aSign != bSign ) {
6486         return
6487                aSign
6488             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6489                  != 0 );
6490     }
6491     return
6492           aSign ? lt128( b.high, b.low, a.high, a.low )
6493         : lt128( a.high, a.low, b.high, b.low );
6494 
6495 }
6496 
6497 /*----------------------------------------------------------------------------
6498 | Returns 1 if the extended double-precision floating-point values `a' and `b'
6499 | cannot be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.
6500 | The comparison is performed according to the IEC/IEEE Standard for Binary
6501 | Floating-Point Arithmetic.
6502 *----------------------------------------------------------------------------*/
6503 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
6504 {
6505     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6506         float_raise(float_flag_invalid, status);
6507         return 1;
6508     }
6509     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
6510               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
6511          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
6512               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
6513        ) {
6514         if (floatx80_is_signaling_nan(a, status)
6515          || floatx80_is_signaling_nan(b, status)) {
6516             float_raise(float_flag_invalid, status);
6517         }
6518         return 1;
6519     }
6520     return 0;
6521 }
6522 
6523 /*----------------------------------------------------------------------------
6524 | Returns the result of converting the quadruple-precision floating-point
6525 | value `a' to the 32-bit two's complement integer format.  The conversion
6526 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6527 | Arithmetic---which means in particular that the conversion is rounded
6528 | according to the current rounding mode.  If `a' is a NaN, the largest
6529 | positive integer is returned.  Otherwise, if the conversion overflows, the
6530 | largest integer with the same sign as `a' is returned.
6531 *----------------------------------------------------------------------------*/
6532 
6533 int32_t float128_to_int32(float128 a, float_status *status)
6534 {
6535     flag aSign;
6536     int32_t aExp, shiftCount;
6537     uint64_t aSig0, aSig1;
6538 
6539     aSig1 = extractFloat128Frac1( a );
6540     aSig0 = extractFloat128Frac0( a );
6541     aExp = extractFloat128Exp( a );
6542     aSign = extractFloat128Sign( a );
6543     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
6544     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
6545     aSig0 |= ( aSig1 != 0 );
6546     shiftCount = 0x4028 - aExp;
6547     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
6548     return roundAndPackInt32(aSign, aSig0, status);
6549 
6550 }
6551 
6552 /*----------------------------------------------------------------------------
6553 | Returns the result of converting the quadruple-precision floating-point
6554 | value `a' to the 32-bit two's complement integer format.  The conversion
6555 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6556 | Arithmetic, except that the conversion is always rounded toward zero.  If
6557 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
6558 | conversion overflows, the largest integer with the same sign as `a' is
6559 | returned.
6560 *----------------------------------------------------------------------------*/
6561 
6562 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
6563 {
6564     flag aSign;
6565     int32_t aExp, shiftCount;
6566     uint64_t aSig0, aSig1, savedASig;
6567     int32_t z;
6568 
6569     aSig1 = extractFloat128Frac1( a );
6570     aSig0 = extractFloat128Frac0( a );
6571     aExp = extractFloat128Exp( a );
6572     aSign = extractFloat128Sign( a );
6573     aSig0 |= ( aSig1 != 0 );
6574     if ( 0x401E < aExp ) {
6575         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
6576         goto invalid;
6577     }
6578     else if ( aExp < 0x3FFF ) {
6579         if (aExp || aSig0) {
6580             status->float_exception_flags |= float_flag_inexact;
6581         }
6582         return 0;
6583     }
6584     aSig0 |= LIT64( 0x0001000000000000 );
6585     shiftCount = 0x402F - aExp;
6586     savedASig = aSig0;
6587     aSig0 >>= shiftCount;
6588     z = aSig0;
6589     if ( aSign ) z = - z;
6590     if ( ( z < 0 ) ^ aSign ) {
6591  invalid:
6592         float_raise(float_flag_invalid, status);
6593         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
6594     }
6595     if ( ( aSig0<<shiftCount ) != savedASig ) {
6596         status->float_exception_flags |= float_flag_inexact;
6597     }
6598     return z;
6599 
6600 }
6601 
6602 /*----------------------------------------------------------------------------
6603 | Returns the result of converting the quadruple-precision floating-point
6604 | value `a' to the 64-bit two's complement integer format.  The conversion
6605 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6606 | Arithmetic---which means in particular that the conversion is rounded
6607 | according to the current rounding mode.  If `a' is a NaN, the largest
6608 | positive integer is returned.  Otherwise, if the conversion overflows, the
6609 | largest integer with the same sign as `a' is returned.
6610 *----------------------------------------------------------------------------*/
6611 
6612 int64_t float128_to_int64(float128 a, float_status *status)
6613 {
6614     flag aSign;
6615     int32_t aExp, shiftCount;
6616     uint64_t aSig0, aSig1;
6617 
6618     aSig1 = extractFloat128Frac1( a );
6619     aSig0 = extractFloat128Frac0( a );
6620     aExp = extractFloat128Exp( a );
6621     aSign = extractFloat128Sign( a );
6622     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
6623     shiftCount = 0x402F - aExp;
6624     if ( shiftCount <= 0 ) {
6625         if ( 0x403E < aExp ) {
6626             float_raise(float_flag_invalid, status);
6627             if (    ! aSign
6628                  || (    ( aExp == 0x7FFF )
6629                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
6630                     )
6631                ) {
6632                 return LIT64( 0x7FFFFFFFFFFFFFFF );
6633             }
6634             return (int64_t) LIT64( 0x8000000000000000 );
6635         }
6636         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
6637     }
6638     else {
6639         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
6640     }
6641     return roundAndPackInt64(aSign, aSig0, aSig1, status);
6642 
6643 }
6644 
6645 /*----------------------------------------------------------------------------
6646 | Returns the result of converting the quadruple-precision floating-point
6647 | value `a' to the 64-bit two's complement integer format.  The conversion
6648 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6649 | Arithmetic, except that the conversion is always rounded toward zero.
6650 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
6651 | the conversion overflows, the largest integer with the same sign as `a' is
6652 | returned.
6653 *----------------------------------------------------------------------------*/
6654 
6655 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
6656 {
6657     flag aSign;
6658     int32_t aExp, shiftCount;
6659     uint64_t aSig0, aSig1;
6660     int64_t z;
6661 
6662     aSig1 = extractFloat128Frac1( a );
6663     aSig0 = extractFloat128Frac0( a );
6664     aExp = extractFloat128Exp( a );
6665     aSign = extractFloat128Sign( a );
6666     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
6667     shiftCount = aExp - 0x402F;
6668     if ( 0 < shiftCount ) {
6669         if ( 0x403E <= aExp ) {
6670             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
6671             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
6672                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
6673                 if (aSig1) {
6674                     status->float_exception_flags |= float_flag_inexact;
6675                 }
6676             }
6677             else {
6678                 float_raise(float_flag_invalid, status);
6679                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
6680                     return LIT64( 0x7FFFFFFFFFFFFFFF );
6681                 }
6682             }
6683             return (int64_t) LIT64( 0x8000000000000000 );
6684         }
6685         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
6686         if ( (uint64_t) ( aSig1<<shiftCount ) ) {
6687             status->float_exception_flags |= float_flag_inexact;
6688         }
6689     }
6690     else {
6691         if ( aExp < 0x3FFF ) {
6692             if ( aExp | aSig0 | aSig1 ) {
6693                 status->float_exception_flags |= float_flag_inexact;
6694             }
6695             return 0;
6696         }
6697         z = aSig0>>( - shiftCount );
6698         if (    aSig1
6699              || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
6700             status->float_exception_flags |= float_flag_inexact;
6701         }
6702     }
6703     if ( aSign ) z = - z;
6704     return z;
6705 
6706 }
6707 
6708 /*----------------------------------------------------------------------------
6709 | Returns the result of converting the quadruple-precision floating-point value
6710 | `a' to the 64-bit unsigned integer format.  The conversion is
6711 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6712 | Arithmetic---which means in particular that the conversion is rounded
6713 | according to the current rounding mode.  If `a' is a NaN, the largest
6714 | positive integer is returned.  If the conversion overflows, the
6715 | largest unsigned integer is returned.  If 'a' is negative, the value is
6716 | rounded and zero is returned; negative values that do not round to zero
6717 | will raise the inexact exception.
6718 *----------------------------------------------------------------------------*/
6719 
6720 uint64_t float128_to_uint64(float128 a, float_status *status)
6721 {
6722     flag aSign;
6723     int aExp;
6724     int shiftCount;
6725     uint64_t aSig0, aSig1;
6726 
6727     aSig0 = extractFloat128Frac0(a);
6728     aSig1 = extractFloat128Frac1(a);
6729     aExp = extractFloat128Exp(a);
6730     aSign = extractFloat128Sign(a);
6731     if (aSign && (aExp > 0x3FFE)) {
6732         float_raise(float_flag_invalid, status);
6733         if (float128_is_any_nan(a)) {
6734             return LIT64(0xFFFFFFFFFFFFFFFF);
6735         } else {
6736             return 0;
6737         }
6738     }
6739     if (aExp) {
6740         aSig0 |= LIT64(0x0001000000000000);
6741     }
6742     shiftCount = 0x402F - aExp;
6743     if (shiftCount <= 0) {
6744         if (0x403E < aExp) {
6745             float_raise(float_flag_invalid, status);
6746             return LIT64(0xFFFFFFFFFFFFFFFF);
6747         }
6748         shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
6749     } else {
6750         shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
6751     }
6752     return roundAndPackUint64(aSign, aSig0, aSig1, status);
6753 }
6754 
6755 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
6756 {
6757     uint64_t v;
6758     signed char current_rounding_mode = status->float_rounding_mode;
6759 
6760     set_float_rounding_mode(float_round_to_zero, status);
6761     v = float128_to_uint64(a, status);
6762     set_float_rounding_mode(current_rounding_mode, status);
6763 
6764     return v;
6765 }
6766 
6767 /*----------------------------------------------------------------------------
6768 | Returns the result of converting the quadruple-precision floating-point
6769 | value `a' to the 32-bit unsigned integer format.  The conversion
6770 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6771 | Arithmetic except that the conversion is always rounded toward zero.
6772 | If `a' is a NaN, the largest positive integer is returned.  Otherwise,
6773 | if the conversion overflows, the largest unsigned integer is returned.
6774 | If 'a' is negative, the value is rounded and zero is returned; negative
6775 | values that do not round to zero will raise the inexact exception.
6776 *----------------------------------------------------------------------------*/
6777 
6778 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
6779 {
6780     uint64_t v;
6781     uint32_t res;
6782     int old_exc_flags = get_float_exception_flags(status);
6783 
6784     v = float128_to_uint64_round_to_zero(a, status);
6785     if (v > 0xffffffff) {
6786         res = 0xffffffff;
6787     } else {
6788         return v;
6789     }
6790     set_float_exception_flags(old_exc_flags, status);
6791     float_raise(float_flag_invalid, status);
6792     return res;
6793 }
6794 
6795 /*----------------------------------------------------------------------------
6796 | Returns the result of converting the quadruple-precision floating-point
6797 | value `a' to the single-precision floating-point format.  The conversion
6798 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6799 | Arithmetic.
6800 *----------------------------------------------------------------------------*/
6801 
6802 float32 float128_to_float32(float128 a, float_status *status)
6803 {
6804     flag aSign;
6805     int32_t aExp;
6806     uint64_t aSig0, aSig1;
6807     uint32_t zSig;
6808 
6809     aSig1 = extractFloat128Frac1( a );
6810     aSig0 = extractFloat128Frac0( a );
6811     aExp = extractFloat128Exp( a );
6812     aSign = extractFloat128Sign( a );
6813     if ( aExp == 0x7FFF ) {
6814         if ( aSig0 | aSig1 ) {
6815             return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
6816         }
6817         return packFloat32( aSign, 0xFF, 0 );
6818     }
6819     aSig0 |= ( aSig1 != 0 );
6820     shift64RightJamming( aSig0, 18, &aSig0 );
6821     zSig = aSig0;
6822     if ( aExp || zSig ) {
6823         zSig |= 0x40000000;
6824         aExp -= 0x3F81;
6825     }
6826     return roundAndPackFloat32(aSign, aExp, zSig, status);
6827 
6828 }
6829 
6830 /*----------------------------------------------------------------------------
6831 | Returns the result of converting the quadruple-precision floating-point
6832 | value `a' to the double-precision floating-point format.  The conversion
6833 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6834 | Arithmetic.
6835 *----------------------------------------------------------------------------*/
6836 
6837 float64 float128_to_float64(float128 a, float_status *status)
6838 {
6839     flag aSign;
6840     int32_t aExp;
6841     uint64_t aSig0, aSig1;
6842 
6843     aSig1 = extractFloat128Frac1( a );
6844     aSig0 = extractFloat128Frac0( a );
6845     aExp = extractFloat128Exp( a );
6846     aSign = extractFloat128Sign( a );
6847     if ( aExp == 0x7FFF ) {
6848         if ( aSig0 | aSig1 ) {
6849             return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
6850         }
6851         return packFloat64( aSign, 0x7FF, 0 );
6852     }
6853     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6854     aSig0 |= ( aSig1 != 0 );
6855     if ( aExp || aSig0 ) {
6856         aSig0 |= LIT64( 0x4000000000000000 );
6857         aExp -= 0x3C01;
6858     }
6859     return roundAndPackFloat64(aSign, aExp, aSig0, status);
6860 
6861 }
6862 
6863 /*----------------------------------------------------------------------------
6864 | Returns the result of converting the quadruple-precision floating-point
6865 | value `a' to the extended double-precision floating-point format.  The
6866 | conversion is performed according to the IEC/IEEE Standard for Binary
6867 | Floating-Point Arithmetic.
6868 *----------------------------------------------------------------------------*/
6869 
6870 floatx80 float128_to_floatx80(float128 a, float_status *status)
6871 {
6872     flag aSign;
6873     int32_t aExp;
6874     uint64_t aSig0, aSig1;
6875 
6876     aSig1 = extractFloat128Frac1( a );
6877     aSig0 = extractFloat128Frac0( a );
6878     aExp = extractFloat128Exp( a );
6879     aSign = extractFloat128Sign( a );
6880     if ( aExp == 0x7FFF ) {
6881         if ( aSig0 | aSig1 ) {
6882             return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
6883         }
6884         return packFloatx80(aSign, floatx80_infinity_high,
6885                                    floatx80_infinity_low);
6886     }
6887     if ( aExp == 0 ) {
6888         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
6889         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6890     }
6891     else {
6892         aSig0 |= LIT64( 0x0001000000000000 );
6893     }
6894     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
6895     return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
6896 
6897 }
6898 
6899 /*----------------------------------------------------------------------------
6900 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6901 | returns the result as a quadruple-precision floating-point value.  The
6902 | operation is performed according to the IEC/IEEE Standard for Binary
6903 | Floating-Point Arithmetic.
6904 *----------------------------------------------------------------------------*/
6905 
6906 float128 float128_round_to_int(float128 a, float_status *status)
6907 {
6908     flag aSign;
6909     int32_t aExp;
6910     uint64_t lastBitMask, roundBitsMask;
6911     float128 z;
6912 
6913     aExp = extractFloat128Exp( a );
6914     if ( 0x402F <= aExp ) {
6915         if ( 0x406F <= aExp ) {
6916             if (    ( aExp == 0x7FFF )
6917                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
6918                ) {
6919                 return propagateFloat128NaN(a, a, status);
6920             }
6921             return a;
6922         }
6923         lastBitMask = 1;
6924         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
6925         roundBitsMask = lastBitMask - 1;
6926         z = a;
6927         switch (status->float_rounding_mode) {
6928         case float_round_nearest_even:
6929             if ( lastBitMask ) {
6930                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
6931                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
6932             }
6933             else {
6934                 if ( (int64_t) z.low < 0 ) {
6935                     ++z.high;
6936                     if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
6937                 }
6938             }
6939             break;
6940         case float_round_ties_away:
6941             if (lastBitMask) {
6942                 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
6943             } else {
6944                 if ((int64_t) z.low < 0) {
6945                     ++z.high;
6946                 }
6947             }
6948             break;
6949         case float_round_to_zero:
6950             break;
6951         case float_round_up:
6952             if (!extractFloat128Sign(z)) {
6953                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6954             }
6955             break;
6956         case float_round_down:
6957             if (extractFloat128Sign(z)) {
6958                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6959             }
6960             break;
6961         default:
6962             abort();
6963         }
6964         z.low &= ~ roundBitsMask;
6965     }
6966     else {
6967         if ( aExp < 0x3FFF ) {
6968             if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
6969             status->float_exception_flags |= float_flag_inexact;
6970             aSign = extractFloat128Sign( a );
6971             switch (status->float_rounding_mode) {
6972              case float_round_nearest_even:
6973                 if (    ( aExp == 0x3FFE )
6974                      && (   extractFloat128Frac0( a )
6975                           | extractFloat128Frac1( a ) )
6976                    ) {
6977                     return packFloat128( aSign, 0x3FFF, 0, 0 );
6978                 }
6979                 break;
6980             case float_round_ties_away:
6981                 if (aExp == 0x3FFE) {
6982                     return packFloat128(aSign, 0x3FFF, 0, 0);
6983                 }
6984                 break;
6985              case float_round_down:
6986                 return
6987                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
6988                     : packFloat128( 0, 0, 0, 0 );
6989              case float_round_up:
6990                 return
6991                       aSign ? packFloat128( 1, 0, 0, 0 )
6992                     : packFloat128( 0, 0x3FFF, 0, 0 );
6993             }
6994             return packFloat128( aSign, 0, 0, 0 );
6995         }
6996         lastBitMask = 1;
6997         lastBitMask <<= 0x402F - aExp;
6998         roundBitsMask = lastBitMask - 1;
6999         z.low = 0;
7000         z.high = a.high;
7001         switch (status->float_rounding_mode) {
7002         case float_round_nearest_even:
7003             z.high += lastBitMask>>1;
7004             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
7005                 z.high &= ~ lastBitMask;
7006             }
7007             break;
7008         case float_round_ties_away:
7009             z.high += lastBitMask>>1;
7010             break;
7011         case float_round_to_zero:
7012             break;
7013         case float_round_up:
7014             if (!extractFloat128Sign(z)) {
7015                 z.high |= ( a.low != 0 );
7016                 z.high += roundBitsMask;
7017             }
7018             break;
7019         case float_round_down:
7020             if (extractFloat128Sign(z)) {
7021                 z.high |= (a.low != 0);
7022                 z.high += roundBitsMask;
7023             }
7024             break;
7025         default:
7026             abort();
7027         }
7028         z.high &= ~ roundBitsMask;
7029     }
7030     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
7031         status->float_exception_flags |= float_flag_inexact;
7032     }
7033     return z;
7034 
7035 }
7036 
7037 /*----------------------------------------------------------------------------
7038 | Returns the result of adding the absolute values of the quadruple-precision
7039 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
7040 | before being returned.  `zSign' is ignored if the result is a NaN.
7041 | The addition is performed according to the IEC/IEEE Standard for Binary
7042 | Floating-Point Arithmetic.
7043 *----------------------------------------------------------------------------*/
7044 
7045 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
7046                                 float_status *status)
7047 {
7048     int32_t aExp, bExp, zExp;
7049     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
7050     int32_t expDiff;
7051 
7052     aSig1 = extractFloat128Frac1( a );
7053     aSig0 = extractFloat128Frac0( a );
7054     aExp = extractFloat128Exp( a );
7055     bSig1 = extractFloat128Frac1( b );
7056     bSig0 = extractFloat128Frac0( b );
7057     bExp = extractFloat128Exp( b );
7058     expDiff = aExp - bExp;
7059     if ( 0 < expDiff ) {
7060         if ( aExp == 0x7FFF ) {
7061             if (aSig0 | aSig1) {
7062                 return propagateFloat128NaN(a, b, status);
7063             }
7064             return a;
7065         }
7066         if ( bExp == 0 ) {
7067             --expDiff;
7068         }
7069         else {
7070             bSig0 |= LIT64( 0x0001000000000000 );
7071         }
7072         shift128ExtraRightJamming(
7073             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
7074         zExp = aExp;
7075     }
7076     else if ( expDiff < 0 ) {
7077         if ( bExp == 0x7FFF ) {
7078             if (bSig0 | bSig1) {
7079                 return propagateFloat128NaN(a, b, status);
7080             }
7081             return packFloat128( zSign, 0x7FFF, 0, 0 );
7082         }
7083         if ( aExp == 0 ) {
7084             ++expDiff;
7085         }
7086         else {
7087             aSig0 |= LIT64( 0x0001000000000000 );
7088         }
7089         shift128ExtraRightJamming(
7090             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
7091         zExp = bExp;
7092     }
7093     else {
7094         if ( aExp == 0x7FFF ) {
7095             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
7096                 return propagateFloat128NaN(a, b, status);
7097             }
7098             return a;
7099         }
7100         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
7101         if ( aExp == 0 ) {
7102             if (status->flush_to_zero) {
7103                 if (zSig0 | zSig1) {
7104                     float_raise(float_flag_output_denormal, status);
7105                 }
7106                 return packFloat128(zSign, 0, 0, 0);
7107             }
7108             return packFloat128( zSign, 0, zSig0, zSig1 );
7109         }
7110         zSig2 = 0;
7111         zSig0 |= LIT64( 0x0002000000000000 );
7112         zExp = aExp;
7113         goto shiftRight1;
7114     }
7115     aSig0 |= LIT64( 0x0001000000000000 );
7116     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
7117     --zExp;
7118     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
7119     ++zExp;
7120  shiftRight1:
7121     shift128ExtraRightJamming(
7122         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
7123  roundAndPack:
7124     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
7125 
7126 }
7127 
7128 /*----------------------------------------------------------------------------
7129 | Returns the result of subtracting the absolute values of the quadruple-
7130 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
7131 | difference is negated before being returned.  `zSign' is ignored if the
7132 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
7133 | Standard for Binary Floating-Point Arithmetic.
7134 *----------------------------------------------------------------------------*/
7135 
7136 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
7137                                 float_status *status)
7138 {
7139     int32_t aExp, bExp, zExp;
7140     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
7141     int32_t expDiff;
7142 
7143     aSig1 = extractFloat128Frac1( a );
7144     aSig0 = extractFloat128Frac0( a );
7145     aExp = extractFloat128Exp( a );
7146     bSig1 = extractFloat128Frac1( b );
7147     bSig0 = extractFloat128Frac0( b );
7148     bExp = extractFloat128Exp( b );
7149     expDiff = aExp - bExp;
7150     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
7151     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
7152     if ( 0 < expDiff ) goto aExpBigger;
7153     if ( expDiff < 0 ) goto bExpBigger;
7154     if ( aExp == 0x7FFF ) {
7155         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
7156             return propagateFloat128NaN(a, b, status);
7157         }
7158         float_raise(float_flag_invalid, status);
7159         return float128_default_nan(status);
7160     }
7161     if ( aExp == 0 ) {
7162         aExp = 1;
7163         bExp = 1;
7164     }
7165     if ( bSig0 < aSig0 ) goto aBigger;
7166     if ( aSig0 < bSig0 ) goto bBigger;
7167     if ( bSig1 < aSig1 ) goto aBigger;
7168     if ( aSig1 < bSig1 ) goto bBigger;
7169     return packFloat128(status->float_rounding_mode == float_round_down,
7170                         0, 0, 0);
7171  bExpBigger:
7172     if ( bExp == 0x7FFF ) {
7173         if (bSig0 | bSig1) {
7174             return propagateFloat128NaN(a, b, status);
7175         }
7176         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
7177     }
7178     if ( aExp == 0 ) {
7179         ++expDiff;
7180     }
7181     else {
7182         aSig0 |= LIT64( 0x4000000000000000 );
7183     }
7184     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
7185     bSig0 |= LIT64( 0x4000000000000000 );
7186  bBigger:
7187     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
7188     zExp = bExp;
7189     zSign ^= 1;
7190     goto normalizeRoundAndPack;
7191  aExpBigger:
7192     if ( aExp == 0x7FFF ) {
7193         if (aSig0 | aSig1) {
7194             return propagateFloat128NaN(a, b, status);
7195         }
7196         return a;
7197     }
7198     if ( bExp == 0 ) {
7199         --expDiff;
7200     }
7201     else {
7202         bSig0 |= LIT64( 0x4000000000000000 );
7203     }
7204     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
7205     aSig0 |= LIT64( 0x4000000000000000 );
7206  aBigger:
7207     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
7208     zExp = aExp;
7209  normalizeRoundAndPack:
7210     --zExp;
7211     return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
7212                                          status);
7213 
7214 }
7215 
7216 /*----------------------------------------------------------------------------
7217 | Returns the result of adding the quadruple-precision floating-point values
7218 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
7219 | for Binary Floating-Point Arithmetic.
7220 *----------------------------------------------------------------------------*/
7221 
7222 float128 float128_add(float128 a, float128 b, float_status *status)
7223 {
7224     flag aSign, bSign;
7225 
7226     aSign = extractFloat128Sign( a );
7227     bSign = extractFloat128Sign( b );
7228     if ( aSign == bSign ) {
7229         return addFloat128Sigs(a, b, aSign, status);
7230     }
7231     else {
7232         return subFloat128Sigs(a, b, aSign, status);
7233     }
7234 
7235 }
7236 
7237 /*----------------------------------------------------------------------------
7238 | Returns the result of subtracting the quadruple-precision floating-point
7239 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
7240 | Standard for Binary Floating-Point Arithmetic.
7241 *----------------------------------------------------------------------------*/
7242 
7243 float128 float128_sub(float128 a, float128 b, float_status *status)
7244 {
7245     flag aSign, bSign;
7246 
7247     aSign = extractFloat128Sign( a );
7248     bSign = extractFloat128Sign( b );
7249     if ( aSign == bSign ) {
7250         return subFloat128Sigs(a, b, aSign, status);
7251     }
7252     else {
7253         return addFloat128Sigs(a, b, aSign, status);
7254     }
7255 
7256 }
7257 
7258 /*----------------------------------------------------------------------------
7259 | Returns the result of multiplying the quadruple-precision floating-point
7260 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
7261 | Standard for Binary Floating-Point Arithmetic.
7262 *----------------------------------------------------------------------------*/
7263 
7264 float128 float128_mul(float128 a, float128 b, float_status *status)
7265 {
7266     flag aSign, bSign, zSign;
7267     int32_t aExp, bExp, zExp;
7268     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
7269 
7270     aSig1 = extractFloat128Frac1( a );
7271     aSig0 = extractFloat128Frac0( a );
7272     aExp = extractFloat128Exp( a );
7273     aSign = extractFloat128Sign( a );
7274     bSig1 = extractFloat128Frac1( b );
7275     bSig0 = extractFloat128Frac0( b );
7276     bExp = extractFloat128Exp( b );
7277     bSign = extractFloat128Sign( b );
7278     zSign = aSign ^ bSign;
7279     if ( aExp == 0x7FFF ) {
7280         if (    ( aSig0 | aSig1 )
7281              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
7282             return propagateFloat128NaN(a, b, status);
7283         }
7284         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
7285         return packFloat128( zSign, 0x7FFF, 0, 0 );
7286     }
7287     if ( bExp == 0x7FFF ) {
7288         if (bSig0 | bSig1) {
7289             return propagateFloat128NaN(a, b, status);
7290         }
7291         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
7292  invalid:
7293             float_raise(float_flag_invalid, status);
7294             return float128_default_nan(status);
7295         }
7296         return packFloat128( zSign, 0x7FFF, 0, 0 );
7297     }
7298     if ( aExp == 0 ) {
7299         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
7300         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7301     }
7302     if ( bExp == 0 ) {
7303         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
7304         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
7305     }
7306     zExp = aExp + bExp - 0x4000;
7307     aSig0 |= LIT64( 0x0001000000000000 );
7308     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
7309     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
7310     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
7311     zSig2 |= ( zSig3 != 0 );
7312     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
7313         shift128ExtraRightJamming(
7314             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
7315         ++zExp;
7316     }
7317     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
7318 
7319 }
7320 
/*----------------------------------------------------------------------------
| Returns the result of dividing the quadruple-precision floating-point value
| `a' by the corresponding value `b'.  The operation is performed according to
| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
|
| Special cases handled before the general path:
|   - a NaN operand is handed to propagateFloat128NaN;
|   - Inf / Inf and 0 / 0 raise float_flag_invalid and return the default NaN;
|   - a nonzero finite value divided by zero raises float_flag_divbyzero and
|     returns an infinity;
|   - Inf / finite returns an infinity, finite / Inf and 0 / nonzero return
|     a zero.
| Every non-NaN result carries the sign aSign ^ bSign.
*----------------------------------------------------------------------------*/

float128 float128_div(float128 a, float128 b, float_status *status)
{
    flag aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    /* Split both operands into sign, biased exponent and fraction words. */
    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    bSign = extractFloat128Sign( b );
    zSign = aSign ^ bSign;
    /* `a' is an infinity or a NaN (exponent field all ones). */
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        if ( bExp == 0x7FFF ) {
            if (bSig0 | bSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            /* Inf / Inf */
            goto invalid;
        }
        /* Inf / finite */
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }
    /* `a' is finite from here on; `b' is an infinity or a NaN. */
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        /* finite / Inf */
        return packFloat128( zSign, 0, 0, 0 );
    }
    /* `b' is zero or subnormal. */
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) {
            /* Division by zero: 0 / 0 is invalid, anything else is Inf. */
            if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
 invalid:
                float_raise(float_flag_invalid, status);
                return float128_default_nan(status);
            }
            float_raise(float_flag_divbyzero, status);
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    /* `a' is zero or subnormal. */
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    zExp = aExp - bExp + 0x3FFD;
    /*
     * OR in bit 48 of the high word (the leading significand bit, which is
     * not stored in the fraction field) and move both significands up by 15
     * so that this bit lands in bit 63.
     */
    shortShift128Left(
        aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
    shortShift128Left(
        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
    /*
     * Make the dividend strictly smaller than the divisor; the halving is
     * compensated by incrementing the result exponent.
     */
    if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
        shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
        ++zExp;
    }
    /*
     * Upper 64 quotient bits: take an estimate, form the remainder
     * a - b * zSig0, and step the estimate down while that remainder is
     * negative.
     */
    zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
    mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
    sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
    }
    /* Lower 64 quotient bits, estimated from the remaining remainder. */
    zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
    /*
     * The exact correction is only performed when the low 14 bits of the
     * estimate are <= 4.
     * NOTE(review): presumably these are the only cases in which the
     * estimate's error could change the rounded result -- confirm against
     * the error bound of estimateDiv128To64.
     */
    if ( ( zSig1 & 0x3FFF ) <= 4 ) {
        mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
        sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
        }
        /* Fold "remainder is nonzero" into the lowest bit as a sticky bit. */
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    /* Shift the 128-bit quotient right by 15; zSig2 receives the extra word. */
    shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}
7407 
/*----------------------------------------------------------------------------
| Returns the remainder of the quadruple-precision floating-point value `a'
| with respect to the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
|
| Special cases handled before the general path:
|   - a NaN operand is handed to propagateFloat128NaN;
|   - an infinite `a' or a zero `b' raises float_flag_invalid and returns the
|     default NaN;
|   - an infinite `b' (with finite `a') or a zero `a' returns `a' unchanged.
*----------------------------------------------------------------------------*/

float128 float128_rem(float128 a, float128 b, float_status *status)
{
    flag aSign, zSign;
    int32_t aExp, bExp, expDiff;
    uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
    uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
    int64_t sigMean0;

    /* Unpack the operands; the sign of `b' is never needed. */
    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    /* `a' is an infinity or a NaN. */
    if ( aExp == 0x7FFF ) {
        if (    ( aSig0 | aSig1 )
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
            return propagateFloat128NaN(a, b, status);
        }
        /* Inf rem anything-but-NaN */
        goto invalid;
    }
    /* `b' is an infinity or a NaN; `a' is finite. */
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        return a;
    }
    /* `b' is zero (invalid) or subnormal (normalize it). */
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) {
 invalid:
            float_raise(float_flag_invalid, status);
            return float128_default_nan(status);
        }
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    /* `a' is zero (returned as is) or subnormal (normalize it). */
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return a;
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    expDiff = aExp - bExp;
    /* |a| is below |b| / 2, so `a' is already the remainder. */
    if ( expDiff < -1 ) return a;
    /*
     * Set the leading significand bit (bit 48 of the high word) and
     * left-align both significands.  When expDiff is -1 the dividend is
     * shifted one bit less than the divisor.
     */
    shortShift128Left(
        aSig0 | LIT64( 0x0001000000000000 ),
        aSig1,
        15 - ( expDiff < 0 ),
        &aSig0,
        &aSig1
    );
    shortShift128Left(
        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
    /* First quotient bit: subtract the divisor once if it fits. */
    q = le128( bSig0, bSig1, aSig0, aSig1 );
    if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    expDiff -= 64;
    /*
     * Reduce the dividend while more than 64 exponent steps remain; each
     * pass lowers expDiff by 61.  The quotient estimate is reduced by 4 and
     * clamped at 0.
     * NOTE(review): presumably the reduction keeps the estimate from
     * exceeding the true quotient -- confirm against estimateDiv128To64.
     */
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        q = ( 4 < q ) ? q - 4 : 0;
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
        shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
        sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
        expDiff -= 61;
    }
    if ( -64 < expDiff ) {
        /* Final partial step: only the top (64 + expDiff) estimate bits count. */
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        q = ( 4 < q ) ? q - 4 : 0;
        q >>= - expDiff;
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
        expDiff += 52;
        if ( expDiff < 0 ) {
            shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
        }
        else {
            shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
        }
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
    }
    else {
        /* No further quotient bits: just apply the same 12-bit right shift. */
        shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
    }
    /*
     * Keep subtracting the divisor, counting in q, until the value turns
     * negative.  alternateASig holds the last non-negative value, aSig the
     * first negative one.
     */
    do {
        alternateASig0 = aSig0;
        alternateASig1 = aSig1;
        ++q;
        sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    } while ( 0 <= (int64_t) aSig0 );
    /*
     * The sum of the two candidates tells which one is closer to zero:
     * a negative sum selects the non-negative candidate.  On an exact tie
     * the parity of q decides.  sigMean0 is signed so the sign of the sum
     * can be tested directly.
     */
    add128(
        aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
    if (    ( sigMean0 < 0 )
         || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
        aSig0 = alternateASig0;
        aSig1 = alternateASig1;
    }
    /* Convert a negative remainder to sign plus magnitude. */
    zSign = ( (int64_t) aSig0 < 0 );
    if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
    return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
                                         status);
}
7514 
/*----------------------------------------------------------------------------
| Returns the square root of the quadruple-precision floating-point value `a'.
| The operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
|
| Special cases handled before the general path:
|   - a NaN is handed to propagateFloat128NaN;
|   - +Inf and -0 are returned unchanged, +0 yields +0;
|   - any other negative input (including -Inf) raises float_flag_invalid
|     and returns the default NaN.
*----------------------------------------------------------------------------*/

float128 float128_sqrt(float128 a, float_status *status)
{
    flag aSign;
    int32_t aExp, zExp;
    uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    /* Infinity or NaN. */
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, a, status);
        }
        if ( ! aSign ) return a;
        /* -Inf */
        goto invalid;
    }
    if ( aSign ) {
        /* -0 is returned as is; every other negative value is invalid. */
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
 invalid:
        float_raise(float_flag_invalid, status);
        return float128_default_nan(status);
    }
    /* +0 or a positive subnormal. */
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    /* Halve the unbiased exponent and re-bias it for the result. */
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
    /* Set the leading significand bit (bit 48 of the high word). */
    aSig0 |= LIT64( 0x0001000000000000 );
    /* Initial root estimate taken from the top bits of the significand. */
    zSig0 = estimateSqrt32( aExp, aSig0>>17 );
    /* The shift is one bit shorter when the biased exponent is odd. */
    shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
    /* Widen the estimate to 64 bits with one division-based refinement. */
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
    doubleZSig0 = zSig0<<1;
    /* rem = a - zSig0^2; lower zSig0 while that remainder is negative. */
    mul64To128( zSig0, zSig0, &term0, &term1 );
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        doubleZSig0 -= 2;
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
    }
    /* Lower 64 result bits, estimated as remainder / (2 * zSig0). */
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
    /*
     * The exact correction is only performed when the low 13 bits of the
     * estimate are <= 5.
     * NOTE(review): presumably these are the only cases in which the
     * estimate's error could change the rounded result -- confirm.
     */
    if ( ( zSig1 & 0x1FFF ) <= 5 ) {
        if ( zSig1 == 0 ) zSig1 = 1;
        /* Subtract 2 * zSig0 * zSig1 and then zSig1^2 from the remainder. */
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        mul64To128( zSig1, zSig1, &term2, &term3 );
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
            term3 |= 1;
            term2 |= doubleZSig0;
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
        }
        /* Fold "remainder is nonzero" into the lowest bit as a sticky bit. */
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    /* Shift the 128-bit root right by 14; zSig2 receives the extra word. */
    shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
    return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);

}
7582 
7583 /*----------------------------------------------------------------------------
7584 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
7585 | the corresponding value `b', and 0 otherwise.  The invalid exception is
7586 | raised if either operand is a NaN.  Otherwise, the comparison is performed
7587 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7588 *----------------------------------------------------------------------------*/
7589 
7590 int float128_eq(float128 a, float128 b, float_status *status)
7591 {
7592 
7593     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7594               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7595          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7596               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7597        ) {
7598         float_raise(float_flag_invalid, status);
7599         return 0;
7600     }
7601     return
7602            ( a.low == b.low )
7603         && (    ( a.high == b.high )
7604              || (    ( a.low == 0 )
7605                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
7606            );
7607 
7608 }
7609 
7610 /*----------------------------------------------------------------------------
7611 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7612 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
7613 | exception is raised if either operand is a NaN.  The comparison is performed
7614 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7615 *----------------------------------------------------------------------------*/
7616 
7617 int float128_le(float128 a, float128 b, float_status *status)
7618 {
7619     flag aSign, bSign;
7620 
7621     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7622               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7623          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7624               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7625        ) {
7626         float_raise(float_flag_invalid, status);
7627         return 0;
7628     }
7629     aSign = extractFloat128Sign( a );
7630     bSign = extractFloat128Sign( b );
7631     if ( aSign != bSign ) {
7632         return
7633                aSign
7634             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7635                  == 0 );
7636     }
7637     return
7638           aSign ? le128( b.high, b.low, a.high, a.low )
7639         : le128( a.high, a.low, b.high, b.low );
7640 
7641 }
7642 
7643 /*----------------------------------------------------------------------------
7644 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7645 | the corresponding value `b', and 0 otherwise.  The invalid exception is
7646 | raised if either operand is a NaN.  The comparison is performed according
7647 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7648 *----------------------------------------------------------------------------*/
7649 
7650 int float128_lt(float128 a, float128 b, float_status *status)
7651 {
7652     flag aSign, bSign;
7653 
7654     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7655               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7656          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7657               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7658        ) {
7659         float_raise(float_flag_invalid, status);
7660         return 0;
7661     }
7662     aSign = extractFloat128Sign( a );
7663     bSign = extractFloat128Sign( b );
7664     if ( aSign != bSign ) {
7665         return
7666                aSign
7667             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7668                  != 0 );
7669     }
7670     return
7671           aSign ? lt128( b.high, b.low, a.high, a.low )
7672         : lt128( a.high, a.low, b.high, b.low );
7673 
7674 }
7675 
7676 /*----------------------------------------------------------------------------
7677 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7678 | be compared, and 0 otherwise.  The invalid exception is raised if either
7679 | operand is a NaN. The comparison is performed according to the IEC/IEEE
7680 | Standard for Binary Floating-Point Arithmetic.
7681 *----------------------------------------------------------------------------*/
7682 
7683 int float128_unordered(float128 a, float128 b, float_status *status)
7684 {
7685     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7686               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7687          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7688               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7689        ) {
7690         float_raise(float_flag_invalid, status);
7691         return 1;
7692     }
7693     return 0;
7694 }
7695 
7696 /*----------------------------------------------------------------------------
7697 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
7698 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
7699 | exception.  The comparison is performed according to the IEC/IEEE Standard
7700 | for Binary Floating-Point Arithmetic.
7701 *----------------------------------------------------------------------------*/
7702 
7703 int float128_eq_quiet(float128 a, float128 b, float_status *status)
7704 {
7705 
7706     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7707               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7708          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7709               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7710        ) {
7711         if (float128_is_signaling_nan(a, status)
7712          || float128_is_signaling_nan(b, status)) {
7713             float_raise(float_flag_invalid, status);
7714         }
7715         return 0;
7716     }
7717     return
7718            ( a.low == b.low )
7719         && (    ( a.high == b.high )
7720              || (    ( a.low == 0 )
7721                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
7722            );
7723 
7724 }
7725 
7726 /*----------------------------------------------------------------------------
7727 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7728 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
7729 | cause an exception.  Otherwise, the comparison is performed according to the
7730 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7731 *----------------------------------------------------------------------------*/
7732 
7733 int float128_le_quiet(float128 a, float128 b, float_status *status)
7734 {
7735     flag aSign, bSign;
7736 
7737     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7738               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7739          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7740               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7741        ) {
7742         if (float128_is_signaling_nan(a, status)
7743          || float128_is_signaling_nan(b, status)) {
7744             float_raise(float_flag_invalid, status);
7745         }
7746         return 0;
7747     }
7748     aSign = extractFloat128Sign( a );
7749     bSign = extractFloat128Sign( b );
7750     if ( aSign != bSign ) {
7751         return
7752                aSign
7753             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7754                  == 0 );
7755     }
7756     return
7757           aSign ? le128( b.high, b.low, a.high, a.low )
7758         : le128( a.high, a.low, b.high, b.low );
7759 
7760 }
7761 
7762 /*----------------------------------------------------------------------------
7763 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7764 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
7765 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
7766 | Standard for Binary Floating-Point Arithmetic.
7767 *----------------------------------------------------------------------------*/
7768 
7769 int float128_lt_quiet(float128 a, float128 b, float_status *status)
7770 {
7771     flag aSign, bSign;
7772 
7773     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7774               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7775          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7776               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7777        ) {
7778         if (float128_is_signaling_nan(a, status)
7779          || float128_is_signaling_nan(b, status)) {
7780             float_raise(float_flag_invalid, status);
7781         }
7782         return 0;
7783     }
7784     aSign = extractFloat128Sign( a );
7785     bSign = extractFloat128Sign( b );
7786     if ( aSign != bSign ) {
7787         return
7788                aSign
7789             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7790                  != 0 );
7791     }
7792     return
7793           aSign ? lt128( b.high, b.low, a.high, a.low )
7794         : lt128( a.high, a.low, b.high, b.low );
7795 
7796 }
7797 
7798 /*----------------------------------------------------------------------------
7799 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7800 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
7801 | comparison is performed according to the IEC/IEEE Standard for Binary
7802 | Floating-Point Arithmetic.
7803 *----------------------------------------------------------------------------*/
7804 
7805 int float128_unordered_quiet(float128 a, float128 b, float_status *status)
7806 {
7807     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7808               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7809          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7810               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7811        ) {
7812         if (float128_is_signaling_nan(a, status)
7813          || float128_is_signaling_nan(b, status)) {
7814             float_raise(float_flag_invalid, status);
7815         }
7816         return 1;
7817     }
7818     return 0;
7819 }
7820 
7821 static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
7822                                             int is_quiet, float_status *status)
7823 {
7824     flag aSign, bSign;
7825 
7826     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
7827         float_raise(float_flag_invalid, status);
7828         return float_relation_unordered;
7829     }
7830     if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
7831           ( extractFloatx80Frac( a )<<1 ) ) ||
7832         ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
7833           ( extractFloatx80Frac( b )<<1 ) )) {
7834         if (!is_quiet ||
7835             floatx80_is_signaling_nan(a, status) ||
7836             floatx80_is_signaling_nan(b, status)) {
7837             float_raise(float_flag_invalid, status);
7838         }
7839         return float_relation_unordered;
7840     }
7841     aSign = extractFloatx80Sign( a );
7842     bSign = extractFloatx80Sign( b );
7843     if ( aSign != bSign ) {
7844 
7845         if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
7846              ( ( a.low | b.low ) == 0 ) ) {
7847             /* zero case */
7848             return float_relation_equal;
7849         } else {
7850             return 1 - (2 * aSign);
7851         }
7852     } else {
7853         if (a.low == b.low && a.high == b.high) {
7854             return float_relation_equal;
7855         } else {
7856             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7857         }
7858     }
7859 }
7860 
7861 int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
7862 {
7863     return floatx80_compare_internal(a, b, 0, status);
7864 }
7865 
7866 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
7867 {
7868     return floatx80_compare_internal(a, b, 1, status);
7869 }
7870 
7871 static inline int float128_compare_internal(float128 a, float128 b,
7872                                             int is_quiet, float_status *status)
7873 {
7874     flag aSign, bSign;
7875 
7876     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7877           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7878         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7879           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7880         if (!is_quiet ||
7881             float128_is_signaling_nan(a, status) ||
7882             float128_is_signaling_nan(b, status)) {
7883             float_raise(float_flag_invalid, status);
7884         }
7885         return float_relation_unordered;
7886     }
7887     aSign = extractFloat128Sign( a );
7888     bSign = extractFloat128Sign( b );
7889     if ( aSign != bSign ) {
7890         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7891             /* zero case */
7892             return float_relation_equal;
7893         } else {
7894             return 1 - (2 * aSign);
7895         }
7896     } else {
7897         if (a.low == b.low && a.high == b.high) {
7898             return float_relation_equal;
7899         } else {
7900             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7901         }
7902     }
7903 }
7904 
7905 int float128_compare(float128 a, float128 b, float_status *status)
7906 {
7907     return float128_compare_internal(a, b, 0, status);
7908 }
7909 
7910 int float128_compare_quiet(float128 a, float128 b, float_status *status)
7911 {
7912     return float128_compare_internal(a, b, 1, status);
7913 }
7914 
7915 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
7916 {
7917     flag aSign;
7918     int32_t aExp;
7919     uint64_t aSig;
7920 
7921     if (floatx80_invalid_encoding(a)) {
7922         float_raise(float_flag_invalid, status);
7923         return floatx80_default_nan(status);
7924     }
7925     aSig = extractFloatx80Frac( a );
7926     aExp = extractFloatx80Exp( a );
7927     aSign = extractFloatx80Sign( a );
7928 
7929     if ( aExp == 0x7FFF ) {
7930         if ( aSig<<1 ) {
7931             return propagateFloatx80NaN(a, a, status);
7932         }
7933         return a;
7934     }
7935 
7936     if (aExp == 0) {
7937         if (aSig == 0) {
7938             return a;
7939         }
7940         aExp++;
7941     }
7942 
7943     if (n > 0x10000) {
7944         n = 0x10000;
7945     } else if (n < -0x10000) {
7946         n = -0x10000;
7947     }
7948 
7949     aExp += n;
7950     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7951                                          aSign, aExp, aSig, 0, status);
7952 }
7953 
7954 float128 float128_scalbn(float128 a, int n, float_status *status)
7955 {
7956     flag aSign;
7957     int32_t aExp;
7958     uint64_t aSig0, aSig1;
7959 
7960     aSig1 = extractFloat128Frac1( a );
7961     aSig0 = extractFloat128Frac0( a );
7962     aExp = extractFloat128Exp( a );
7963     aSign = extractFloat128Sign( a );
7964     if ( aExp == 0x7FFF ) {
7965         if ( aSig0 | aSig1 ) {
7966             return propagateFloat128NaN(a, a, status);
7967         }
7968         return a;
7969     }
7970     if (aExp != 0) {
7971         aSig0 |= LIT64( 0x0001000000000000 );
7972     } else if (aSig0 == 0 && aSig1 == 0) {
7973         return a;
7974     } else {
7975         aExp++;
7976     }
7977 
7978     if (n > 0x10000) {
7979         n = 0x10000;
7980     } else if (n < -0x10000) {
7981         n = -0x10000;
7982     }
7983 
7984     aExp += n - 1;
7985     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7986                                          , status);
7987 
7988 }
7989 
7990 static void __attribute__((constructor)) softfloat_init(void)
7991 {
7992     union_float64 ua, ub, uc, ur;
7993 
7994     if (QEMU_NO_HARDFLOAT) {
7995         return;
7996     }
7997     /*
7998      * Test that the host's FMA is not obviously broken. For example,
7999      * glibc < 2.23 can perform an incorrect FMA on certain hosts; see
8000      *   https://sourceware.org/bugzilla/show_bug.cgi?id=13304
8001      */
8002     ua.s = 0x0020000000000001ULL;
8003     ub.s = 0x3ca0000000000000ULL;
8004     uc.s = 0x0020000000000000ULL;
8005     ur.h = fma(ua.h, ub.h, uc.h);
8006     if (ur.s != 0x0020000000000001ULL) {
8007         force_soft_fma = true;
8008     }
8009 }
8010