xref: /openbmc/qemu/fpu/softfloat.c (revision 9af2398977a78d37bf184d6ff6bd04c72bfbf006)
1 /*
2  * QEMU float support
3  *
4  * The code in this source file is derived from release 2a of the SoftFloat
5  * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6  * some later contributions) are provided under that license, as detailed below.
7  * It has subsequently been modified by contributors to the QEMU Project,
8  * so some portions are provided under:
9  *  the SoftFloat-2a license
10  *  the BSD license
11  *  GPL-v2-or-later
12  *
13  * Any future contributions to this file after December 1st 2014 will be
14  * taken to be licensed under the Softfloat-2a license unless specifically
15  * indicated otherwise.
16  */
17 
18 /*
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
22 
23 Written by John R. Hauser.  This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704.  Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980.  The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
32 
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38 
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
43 
44 ===============================================================================
45 */
46 
47 /* BSD licensing:
48  * Copyright (c) 2006, Fabrice Bellard
49  * All rights reserved.
50  *
51  * Redistribution and use in source and binary forms, with or without
52  * modification, are permitted provided that the following conditions are met:
53  *
54  * 1. Redistributions of source code must retain the above copyright notice,
55  * this list of conditions and the following disclaimer.
56  *
57  * 2. Redistributions in binary form must reproduce the above copyright notice,
58  * this list of conditions and the following disclaimer in the documentation
59  * and/or other materials provided with the distribution.
60  *
61  * 3. Neither the name of the copyright holder nor the names of its contributors
62  * may be used to endorse or promote products derived from this software without
63  * specific prior written permission.
64  *
65  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75  * THE POSSIBILITY OF SUCH DAMAGE.
76  */
77 
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79  * version 2 or later. See the COPYING file in the top-level directory.
80  */
81 
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83  * target-dependent and needs the TARGET_* macros.
84  */
85 #include "qemu/osdep.h"
86 #include "qemu/bitops.h"
87 #include "fpu/softfloat.h"
88 
89 /* We only need stdlib for abort() */
90 
91 /*----------------------------------------------------------------------------
92 | Primitive arithmetic functions, including multi-word arithmetic, and
93 | division and square root approximations.  (Can be specialized to target if
94 | desired.)
95 *----------------------------------------------------------------------------*/
96 #include "softfloat-macros.h"
97 
98 /*----------------------------------------------------------------------------
99 | Functions and definitions to determine:  (1) whether tininess for underflow
100 | is detected before or after rounding by default, (2) what (if anything)
101 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
102 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
103 | are propagated from function inputs to output.  These details are target-
104 | specific.
105 *----------------------------------------------------------------------------*/
106 #include "softfloat-specialize.h"
107 
108 /*----------------------------------------------------------------------------
109 | Returns the fraction bits of the half-precision floating-point value `a'.
110 *----------------------------------------------------------------------------*/
111 
112 static inline uint32_t extractFloat16Frac(float16 a)
113 {
114     return float16_val(a) & 0x3ff;
115 }
116 
117 /*----------------------------------------------------------------------------
118 | Returns the exponent bits of the half-precision floating-point value `a'.
119 *----------------------------------------------------------------------------*/
120 
121 static inline int extractFloat16Exp(float16 a)
122 {
123     return (float16_val(a) >> 10) & 0x1f;
124 }
125 
126 /*----------------------------------------------------------------------------
127 | Returns the sign bit of the single-precision floating-point value `a'.
128 *----------------------------------------------------------------------------*/
129 
130 static inline flag extractFloat16Sign(float16 a)
131 {
132     return float16_val(a)>>15;
133 }
134 
135 /*----------------------------------------------------------------------------
136 | Returns the fraction bits of the single-precision floating-point value `a'.
137 *----------------------------------------------------------------------------*/
138 
139 static inline uint32_t extractFloat32Frac(float32 a)
140 {
141     return float32_val(a) & 0x007FFFFF;
142 }
143 
144 /*----------------------------------------------------------------------------
145 | Returns the exponent bits of the single-precision floating-point value `a'.
146 *----------------------------------------------------------------------------*/
147 
148 static inline int extractFloat32Exp(float32 a)
149 {
150     return (float32_val(a) >> 23) & 0xFF;
151 }
152 
153 /*----------------------------------------------------------------------------
154 | Returns the sign bit of the single-precision floating-point value `a'.
155 *----------------------------------------------------------------------------*/
156 
157 static inline flag extractFloat32Sign(float32 a)
158 {
159     return float32_val(a) >> 31;
160 }
161 
162 /*----------------------------------------------------------------------------
163 | Returns the fraction bits of the double-precision floating-point value `a'.
164 *----------------------------------------------------------------------------*/
165 
166 static inline uint64_t extractFloat64Frac(float64 a)
167 {
168     return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF);
169 }
170 
171 /*----------------------------------------------------------------------------
172 | Returns the exponent bits of the double-precision floating-point value `a'.
173 *----------------------------------------------------------------------------*/
174 
175 static inline int extractFloat64Exp(float64 a)
176 {
177     return (float64_val(a) >> 52) & 0x7FF;
178 }
179 
180 /*----------------------------------------------------------------------------
181 | Returns the sign bit of the double-precision floating-point value `a'.
182 *----------------------------------------------------------------------------*/
183 
184 static inline flag extractFloat64Sign(float64 a)
185 {
186     return float64_val(a) >> 63;
187 }
188 
189 /*
190  * Classify a floating point number. Everything above float_class_qnan
191  * is a NaN so cls >= float_class_qnan is any NaN.
192  */
193 
194 typedef enum __attribute__ ((__packed__)) {
195     float_class_unclassified,
196     float_class_zero,
197     float_class_normal,
198     float_class_inf,
199     float_class_qnan,  /* all NaNs from here */
200     float_class_snan,
201     float_class_dnan,
202     float_class_msnan, /* maybe silenced */
203 } FloatClass;
204 
205 /*
206  * Structure holding all of the decomposed parts of a float. The
207  * exponent is unbiased and the fraction is normalized. All
208  * calculations are done with a 64 bit fraction and then rounded as
209  * appropriate for the final format.
210  *
211  * Thanks to the packed FloatClass a decent compiler should be able to
212  * fit the whole structure into registers and avoid using the stack
213  * for parameter passing.
214  */
215 
216 typedef struct {
217     uint64_t frac;
218     int32_t  exp;
219     FloatClass cls;
220     bool sign;
221 } FloatParts;
222 
223 #define DECOMPOSED_BINARY_POINT    (64 - 2)
224 #define DECOMPOSED_IMPLICIT_BIT    (1ull << DECOMPOSED_BINARY_POINT)
225 #define DECOMPOSED_OVERFLOW_BIT    (DECOMPOSED_IMPLICIT_BIT << 1)
226 
227 /* Structure holding all of the relevant parameters for a format.
228  *   exp_size: the size of the exponent field
229  *   exp_bias: the offset applied to the exponent field
230  *   exp_max: the maximum normalised exponent
231  *   frac_size: the size of the fraction field
232  *   frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
233  * The following are computed based the size of fraction
234  *   frac_lsb: least significant bit of fraction
235  *   fram_lsbm1: the bit bellow the least significant bit (for rounding)
236  *   round_mask/roundeven_mask: masks used for rounding
237  */
238 typedef struct {
239     int exp_size;
240     int exp_bias;
241     int exp_max;
242     int frac_size;
243     int frac_shift;
244     uint64_t frac_lsb;
245     uint64_t frac_lsbm1;
246     uint64_t round_mask;
247     uint64_t roundeven_mask;
248 } FloatFmt;
249 
250 /* Expand fields based on the size of exponent and fraction */
251 #define FLOAT_PARAMS(E, F)                                           \
252     .exp_size       = E,                                             \
253     .exp_bias       = ((1 << E) - 1) >> 1,                           \
254     .exp_max        = (1 << E) - 1,                                  \
255     .frac_size      = F,                                             \
256     .frac_shift     = DECOMPOSED_BINARY_POINT - F,                   \
257     .frac_lsb       = 1ull << (DECOMPOSED_BINARY_POINT - F),         \
258     .frac_lsbm1     = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1),   \
259     .round_mask     = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1,   \
260     .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1
261 
262 static const FloatFmt float16_params = {
263     FLOAT_PARAMS(5, 10)
264 };
265 
266 static const FloatFmt float32_params = {
267     FLOAT_PARAMS(8, 23)
268 };
269 
270 static const FloatFmt float64_params = {
271     FLOAT_PARAMS(11, 52)
272 };
273 
274 /* Unpack a float to parts, but do not canonicalize.  */
275 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw)
276 {
277     const int sign_pos = fmt.frac_size + fmt.exp_size;
278 
279     return (FloatParts) {
280         .cls = float_class_unclassified,
281         .sign = extract64(raw, sign_pos, 1),
282         .exp = extract64(raw, fmt.frac_size, fmt.exp_size),
283         .frac = extract64(raw, 0, fmt.frac_size),
284     };
285 }
286 
287 static inline FloatParts float16_unpack_raw(float16 f)
288 {
289     return unpack_raw(float16_params, f);
290 }
291 
292 static inline FloatParts float32_unpack_raw(float32 f)
293 {
294     return unpack_raw(float32_params, f);
295 }
296 
297 static inline FloatParts float64_unpack_raw(float64 f)
298 {
299     return unpack_raw(float64_params, f);
300 }
301 
302 /* Pack a float from parts, but do not canonicalize.  */
303 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p)
304 {
305     const int sign_pos = fmt.frac_size + fmt.exp_size;
306     uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp);
307     return deposit64(ret, sign_pos, 1, p.sign);
308 }
309 
310 static inline float16 float16_pack_raw(FloatParts p)
311 {
312     return make_float16(pack_raw(float16_params, p));
313 }
314 
315 static inline float32 float32_pack_raw(FloatParts p)
316 {
317     return make_float32(pack_raw(float32_params, p));
318 }
319 
320 static inline float64 float64_pack_raw(FloatParts p)
321 {
322     return make_float64(pack_raw(float64_params, p));
323 }
324 
325 /* Canonicalize EXP and FRAC, setting CLS.  */
326 static FloatParts canonicalize(FloatParts part, const FloatFmt *parm,
327                                float_status *status)
328 {
329     if (part.exp == parm->exp_max) {
330         if (part.frac == 0) {
331             part.cls = float_class_inf;
332         } else {
333 #ifdef NO_SIGNALING_NANS
334             part.cls = float_class_qnan;
335 #else
336             int64_t msb = part.frac << (parm->frac_shift + 2);
337             if ((msb < 0) == status->snan_bit_is_one) {
338                 part.cls = float_class_snan;
339             } else {
340                 part.cls = float_class_qnan;
341             }
342 #endif
343         }
344     } else if (part.exp == 0) {
345         if (likely(part.frac == 0)) {
346             part.cls = float_class_zero;
347         } else if (status->flush_inputs_to_zero) {
348             float_raise(float_flag_input_denormal, status);
349             part.cls = float_class_zero;
350             part.frac = 0;
351         } else {
352             int shift = clz64(part.frac) - 1;
353             part.cls = float_class_normal;
354             part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
355             part.frac <<= shift;
356         }
357     } else {
358         part.cls = float_class_normal;
359         part.exp -= parm->exp_bias;
360         part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
361     }
362     return part;
363 }
364 
365 /* Round and uncanonicalize a floating-point number by parts. There
366  * are FRAC_SHIFT bits that may require rounding at the bottom of the
367  * fraction; these bits will be removed. The exponent will be biased
368  * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
369  */
370 
371 static FloatParts round_canonical(FloatParts p, float_status *s,
372                                   const FloatFmt *parm)
373 {
374     const uint64_t frac_lsbm1 = parm->frac_lsbm1;
375     const uint64_t round_mask = parm->round_mask;
376     const uint64_t roundeven_mask = parm->roundeven_mask;
377     const int exp_max = parm->exp_max;
378     const int frac_shift = parm->frac_shift;
379     uint64_t frac, inc;
380     int exp, flags = 0;
381     bool overflow_norm;
382 
383     frac = p.frac;
384     exp = p.exp;
385 
386     switch (p.cls) {
387     case float_class_normal:
388         switch (s->float_rounding_mode) {
389         case float_round_nearest_even:
390             overflow_norm = false;
391             inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
392             break;
393         case float_round_ties_away:
394             overflow_norm = false;
395             inc = frac_lsbm1;
396             break;
397         case float_round_to_zero:
398             overflow_norm = true;
399             inc = 0;
400             break;
401         case float_round_up:
402             inc = p.sign ? 0 : round_mask;
403             overflow_norm = p.sign;
404             break;
405         case float_round_down:
406             inc = p.sign ? round_mask : 0;
407             overflow_norm = !p.sign;
408             break;
409         default:
410             g_assert_not_reached();
411         }
412 
413         exp += parm->exp_bias;
414         if (likely(exp > 0)) {
415             if (frac & round_mask) {
416                 flags |= float_flag_inexact;
417                 frac += inc;
418                 if (frac & DECOMPOSED_OVERFLOW_BIT) {
419                     frac >>= 1;
420                     exp++;
421                 }
422             }
423             frac >>= frac_shift;
424 
425             if (unlikely(exp >= exp_max)) {
426                 flags |= float_flag_overflow | float_flag_inexact;
427                 if (overflow_norm) {
428                     exp = exp_max - 1;
429                     frac = -1;
430                 } else {
431                     p.cls = float_class_inf;
432                     goto do_inf;
433                 }
434             }
435         } else if (s->flush_to_zero) {
436             flags |= float_flag_output_denormal;
437             p.cls = float_class_zero;
438             goto do_zero;
439         } else {
440             bool is_tiny = (s->float_detect_tininess
441                             == float_tininess_before_rounding)
442                         || (exp < 0)
443                         || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT);
444 
445             shift64RightJamming(frac, 1 - exp, &frac);
446             if (frac & round_mask) {
447                 /* Need to recompute round-to-even.  */
448                 if (s->float_rounding_mode == float_round_nearest_even) {
449                     inc = ((frac & roundeven_mask) != frac_lsbm1
450                            ? frac_lsbm1 : 0);
451                 }
452                 flags |= float_flag_inexact;
453                 frac += inc;
454             }
455 
456             exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
457             frac >>= frac_shift;
458 
459             if (is_tiny && (flags & float_flag_inexact)) {
460                 flags |= float_flag_underflow;
461             }
462             if (exp == 0 && frac == 0) {
463                 p.cls = float_class_zero;
464             }
465         }
466         break;
467 
468     case float_class_zero:
469     do_zero:
470         exp = 0;
471         frac = 0;
472         break;
473 
474     case float_class_inf:
475     do_inf:
476         exp = exp_max;
477         frac = 0;
478         break;
479 
480     case float_class_qnan:
481     case float_class_snan:
482         exp = exp_max;
483         break;
484 
485     default:
486         g_assert_not_reached();
487     }
488 
489     float_raise(flags, s);
490     p.exp = exp;
491     p.frac = frac;
492     return p;
493 }
494 
495 static FloatParts float16_unpack_canonical(float16 f, float_status *s)
496 {
497     return canonicalize(float16_unpack_raw(f), &float16_params, s);
498 }
499 
500 static float16 float16_round_pack_canonical(FloatParts p, float_status *s)
501 {
502     switch (p.cls) {
503     case float_class_dnan:
504         return float16_default_nan(s);
505     case float_class_msnan:
506         return float16_maybe_silence_nan(float16_pack_raw(p), s);
507     default:
508         p = round_canonical(p, s, &float16_params);
509         return float16_pack_raw(p);
510     }
511 }
512 
513 static FloatParts float32_unpack_canonical(float32 f, float_status *s)
514 {
515     return canonicalize(float32_unpack_raw(f), &float32_params, s);
516 }
517 
518 static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
519 {
520     switch (p.cls) {
521     case float_class_dnan:
522         return float32_default_nan(s);
523     case float_class_msnan:
524         return float32_maybe_silence_nan(float32_pack_raw(p), s);
525     default:
526         p = round_canonical(p, s, &float32_params);
527         return float32_pack_raw(p);
528     }
529 }
530 
531 static FloatParts float64_unpack_canonical(float64 f, float_status *s)
532 {
533     return canonicalize(float64_unpack_raw(f), &float64_params, s);
534 }
535 
536 static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
537 {
538     switch (p.cls) {
539     case float_class_dnan:
540         return float64_default_nan(s);
541     case float_class_msnan:
542         return float64_maybe_silence_nan(float64_pack_raw(p), s);
543     default:
544         p = round_canonical(p, s, &float64_params);
545         return float64_pack_raw(p);
546     }
547 }
548 
549 /* Simple helpers for checking if what NaN we have */
550 static bool is_nan(FloatClass c)
551 {
552     return unlikely(c >= float_class_qnan);
553 }
554 static bool is_snan(FloatClass c)
555 {
556     return c == float_class_snan;
557 }
558 static bool is_qnan(FloatClass c)
559 {
560     return c == float_class_qnan;
561 }
562 
563 static FloatParts return_nan(FloatParts a, float_status *s)
564 {
565     switch (a.cls) {
566     case float_class_snan:
567         s->float_exception_flags |= float_flag_invalid;
568         a.cls = float_class_msnan;
569         /* fall through */
570     case float_class_qnan:
571         if (s->default_nan_mode) {
572             a.cls = float_class_dnan;
573         }
574         break;
575 
576     default:
577         g_assert_not_reached();
578     }
579     return a;
580 }
581 
582 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s)
583 {
584     if (is_snan(a.cls) || is_snan(b.cls)) {
585         s->float_exception_flags |= float_flag_invalid;
586     }
587 
588     if (s->default_nan_mode) {
589         a.cls = float_class_dnan;
590     } else {
591         if (pickNaN(is_qnan(a.cls), is_snan(a.cls),
592                     is_qnan(b.cls), is_snan(b.cls),
593                     a.frac > b.frac ||
594                     (a.frac == b.frac && a.sign < b.sign))) {
595             a = b;
596         }
597         a.cls = float_class_msnan;
598     }
599     return a;
600 }
601 
602 static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c,
603                                   bool inf_zero, float_status *s)
604 {
605     if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
606         s->float_exception_flags |= float_flag_invalid;
607     }
608 
609     if (s->default_nan_mode) {
610         a.cls = float_class_dnan;
611     } else {
612         switch (pickNaNMulAdd(is_qnan(a.cls), is_snan(a.cls),
613                               is_qnan(b.cls), is_snan(b.cls),
614                               is_qnan(c.cls), is_snan(c.cls),
615                               inf_zero, s)) {
616         case 0:
617             break;
618         case 1:
619             a = b;
620             break;
621         case 2:
622             a = c;
623             break;
624         case 3:
625             a.cls = float_class_dnan;
626             return a;
627         default:
628             g_assert_not_reached();
629         }
630 
631         a.cls = float_class_msnan;
632     }
633     return a;
634 }
635 
636 /*
637  * Returns the result of adding or subtracting the values of the
638  * floating-point values `a' and `b'. The operation is performed
639  * according to the IEC/IEEE Standard for Binary Floating-Point
640  * Arithmetic.
641  */
642 
643 static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
644                                 float_status *s)
645 {
646     bool a_sign = a.sign;
647     bool b_sign = b.sign ^ subtract;
648 
649     if (a_sign != b_sign) {
650         /* Subtraction */
651 
652         if (a.cls == float_class_normal && b.cls == float_class_normal) {
653             if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
654                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
655                 a.frac = a.frac - b.frac;
656             } else {
657                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
658                 a.frac = b.frac - a.frac;
659                 a.exp = b.exp;
660                 a_sign ^= 1;
661             }
662 
663             if (a.frac == 0) {
664                 a.cls = float_class_zero;
665                 a.sign = s->float_rounding_mode == float_round_down;
666             } else {
667                 int shift = clz64(a.frac) - 1;
668                 a.frac = a.frac << shift;
669                 a.exp = a.exp - shift;
670                 a.sign = a_sign;
671             }
672             return a;
673         }
674         if (is_nan(a.cls) || is_nan(b.cls)) {
675             return pick_nan(a, b, s);
676         }
677         if (a.cls == float_class_inf) {
678             if (b.cls == float_class_inf) {
679                 float_raise(float_flag_invalid, s);
680                 a.cls = float_class_dnan;
681             }
682             return a;
683         }
684         if (a.cls == float_class_zero && b.cls == float_class_zero) {
685             a.sign = s->float_rounding_mode == float_round_down;
686             return a;
687         }
688         if (a.cls == float_class_zero || b.cls == float_class_inf) {
689             b.sign = a_sign ^ 1;
690             return b;
691         }
692         if (b.cls == float_class_zero) {
693             return a;
694         }
695     } else {
696         /* Addition */
697         if (a.cls == float_class_normal && b.cls == float_class_normal) {
698             if (a.exp > b.exp) {
699                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
700             } else if (a.exp < b.exp) {
701                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
702                 a.exp = b.exp;
703             }
704             a.frac += b.frac;
705             if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
706                 a.frac >>= 1;
707                 a.exp += 1;
708             }
709             return a;
710         }
711         if (is_nan(a.cls) || is_nan(b.cls)) {
712             return pick_nan(a, b, s);
713         }
714         if (a.cls == float_class_inf || b.cls == float_class_zero) {
715             return a;
716         }
717         if (b.cls == float_class_inf || a.cls == float_class_zero) {
718             b.sign = b_sign;
719             return b;
720         }
721     }
722     g_assert_not_reached();
723 }
724 
725 /*
726  * Returns the result of adding or subtracting the floating-point
727  * values `a' and `b'. The operation is performed according to the
728  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
729  */
730 
731 float16  __attribute__((flatten)) float16_add(float16 a, float16 b,
732                                               float_status *status)
733 {
734     FloatParts pa = float16_unpack_canonical(a, status);
735     FloatParts pb = float16_unpack_canonical(b, status);
736     FloatParts pr = addsub_floats(pa, pb, false, status);
737 
738     return float16_round_pack_canonical(pr, status);
739 }
740 
741 float32 __attribute__((flatten)) float32_add(float32 a, float32 b,
742                                              float_status *status)
743 {
744     FloatParts pa = float32_unpack_canonical(a, status);
745     FloatParts pb = float32_unpack_canonical(b, status);
746     FloatParts pr = addsub_floats(pa, pb, false, status);
747 
748     return float32_round_pack_canonical(pr, status);
749 }
750 
751 float64 __attribute__((flatten)) float64_add(float64 a, float64 b,
752                                              float_status *status)
753 {
754     FloatParts pa = float64_unpack_canonical(a, status);
755     FloatParts pb = float64_unpack_canonical(b, status);
756     FloatParts pr = addsub_floats(pa, pb, false, status);
757 
758     return float64_round_pack_canonical(pr, status);
759 }
760 
761 float16 __attribute__((flatten)) float16_sub(float16 a, float16 b,
762                                              float_status *status)
763 {
764     FloatParts pa = float16_unpack_canonical(a, status);
765     FloatParts pb = float16_unpack_canonical(b, status);
766     FloatParts pr = addsub_floats(pa, pb, true, status);
767 
768     return float16_round_pack_canonical(pr, status);
769 }
770 
771 float32 __attribute__((flatten)) float32_sub(float32 a, float32 b,
772                                              float_status *status)
773 {
774     FloatParts pa = float32_unpack_canonical(a, status);
775     FloatParts pb = float32_unpack_canonical(b, status);
776     FloatParts pr = addsub_floats(pa, pb, true, status);
777 
778     return float32_round_pack_canonical(pr, status);
779 }
780 
781 float64 __attribute__((flatten)) float64_sub(float64 a, float64 b,
782                                              float_status *status)
783 {
784     FloatParts pa = float64_unpack_canonical(a, status);
785     FloatParts pb = float64_unpack_canonical(b, status);
786     FloatParts pr = addsub_floats(pa, pb, true, status);
787 
788     return float64_round_pack_canonical(pr, status);
789 }
790 
791 /*
792  * Returns the result of multiplying the floating-point values `a' and
793  * `b'. The operation is performed according to the IEC/IEEE Standard
794  * for Binary Floating-Point Arithmetic.
795  */
796 
797 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s)
798 {
799     bool sign = a.sign ^ b.sign;
800 
801     if (a.cls == float_class_normal && b.cls == float_class_normal) {
802         uint64_t hi, lo;
803         int exp = a.exp + b.exp;
804 
805         mul64To128(a.frac, b.frac, &hi, &lo);
806         shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
807         if (lo & DECOMPOSED_OVERFLOW_BIT) {
808             shift64RightJamming(lo, 1, &lo);
809             exp += 1;
810         }
811 
812         /* Re-use a */
813         a.exp = exp;
814         a.sign = sign;
815         a.frac = lo;
816         return a;
817     }
818     /* handle all the NaN cases */
819     if (is_nan(a.cls) || is_nan(b.cls)) {
820         return pick_nan(a, b, s);
821     }
822     /* Inf * Zero == NaN */
823     if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
824         (a.cls == float_class_zero && b.cls == float_class_inf)) {
825         s->float_exception_flags |= float_flag_invalid;
826         a.cls = float_class_dnan;
827         a.sign = sign;
828         return a;
829     }
830     /* Multiply by 0 or Inf */
831     if (a.cls == float_class_inf || a.cls == float_class_zero) {
832         a.sign = sign;
833         return a;
834     }
835     if (b.cls == float_class_inf || b.cls == float_class_zero) {
836         b.sign = sign;
837         return b;
838     }
839     g_assert_not_reached();
840 }
841 
842 float16 __attribute__((flatten)) float16_mul(float16 a, float16 b,
843                                              float_status *status)
844 {
845     FloatParts pa = float16_unpack_canonical(a, status);
846     FloatParts pb = float16_unpack_canonical(b, status);
847     FloatParts pr = mul_floats(pa, pb, status);
848 
849     return float16_round_pack_canonical(pr, status);
850 }
851 
852 float32 __attribute__((flatten)) float32_mul(float32 a, float32 b,
853                                              float_status *status)
854 {
855     FloatParts pa = float32_unpack_canonical(a, status);
856     FloatParts pb = float32_unpack_canonical(b, status);
857     FloatParts pr = mul_floats(pa, pb, status);
858 
859     return float32_round_pack_canonical(pr, status);
860 }
861 
862 float64 __attribute__((flatten)) float64_mul(float64 a, float64 b,
863                                              float_status *status)
864 {
865     FloatParts pa = float64_unpack_canonical(a, status);
866     FloatParts pb = float64_unpack_canonical(b, status);
867     FloatParts pr = mul_floats(pa, pb, status);
868 
869     return float64_round_pack_canonical(pr, status);
870 }
871 
872 /*
873  * Returns the result of multiplying the floating-point values `a' and
874  * `b' then adding 'c', with no intermediate rounding step after the
875  * multiplication. The operation is performed according to the
876  * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
877  * The flags argument allows the caller to select negation of the
878  * addend, the intermediate product, or the final result. (The
879  * difference between this and having the caller do a separate
880  * negation is that negating externally will flip the sign bit on
881  * NaNs.)
882  */
883 
884 static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c,
885                                 int flags, float_status *s)
886 {
887     bool inf_zero = ((1 << a.cls) | (1 << b.cls)) ==
888                     ((1 << float_class_inf) | (1 << float_class_zero));
889     bool p_sign;
890     bool sign_flip = flags & float_muladd_negate_result;
891     FloatClass p_class;
892     uint64_t hi, lo;
893     int p_exp;
894 
895     /* It is implementation-defined whether the cases of (0,inf,qnan)
896      * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
897      * they return if they do), so we have to hand this information
898      * off to the target-specific pick-a-NaN routine.
899      */
900     if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) {
901         return pick_nan_muladd(a, b, c, inf_zero, s);
902     }
903 
904     if (inf_zero) {
905         s->float_exception_flags |= float_flag_invalid;
906         a.cls = float_class_dnan;
907         return a;
908     }
909 
910     if (flags & float_muladd_negate_c) {
911         c.sign ^= 1;
912     }
913 
914     p_sign = a.sign ^ b.sign;
915 
916     if (flags & float_muladd_negate_product) {
917         p_sign ^= 1;
918     }
919 
920     if (a.cls == float_class_inf || b.cls == float_class_inf) {
921         p_class = float_class_inf;
922     } else if (a.cls == float_class_zero || b.cls == float_class_zero) {
923         p_class = float_class_zero;
924     } else {
925         p_class = float_class_normal;
926     }
927 
928     if (c.cls == float_class_inf) {
929         if (p_class == float_class_inf && p_sign != c.sign) {
930             s->float_exception_flags |= float_flag_invalid;
931             a.cls = float_class_dnan;
932         } else {
933             a.cls = float_class_inf;
934             a.sign = c.sign ^ sign_flip;
935         }
936         return a;
937     }
938 
939     if (p_class == float_class_inf) {
940         a.cls = float_class_inf;
941         a.sign = p_sign ^ sign_flip;
942         return a;
943     }
944 
945     if (p_class == float_class_zero) {
946         if (c.cls == float_class_zero) {
947             if (p_sign != c.sign) {
948                 p_sign = s->float_rounding_mode == float_round_down;
949             }
950             c.sign = p_sign;
951         } else if (flags & float_muladd_halve_result) {
952             c.exp -= 1;
953         }
954         c.sign ^= sign_flip;
955         return c;
956     }
957 
958     /* a & b should be normals now... */
959     assert(a.cls == float_class_normal &&
960            b.cls == float_class_normal);
961 
962     p_exp = a.exp + b.exp;
963 
964     /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
965      * result.
966      */
967     mul64To128(a.frac, b.frac, &hi, &lo);
968     /* binary point now at bit 124 */
969 
970     /* check for overflow */
971     if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) {
972         shift128RightJamming(hi, lo, 1, &hi, &lo);
973         p_exp += 1;
974     }
975 
976     /* + add/sub */
977     if (c.cls == float_class_zero) {
978         /* move binary point back to 62 */
979         shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
980     } else {
981         int exp_diff = p_exp - c.exp;
982         if (p_sign == c.sign) {
983             /* Addition */
984             if (exp_diff <= 0) {
985                 shift128RightJamming(hi, lo,
986                                      DECOMPOSED_BINARY_POINT - exp_diff,
987                                      &hi, &lo);
988                 lo += c.frac;
989                 p_exp = c.exp;
990             } else {
991                 uint64_t c_hi, c_lo;
992                 /* shift c to the same binary point as the product (124) */
993                 c_hi = c.frac >> 2;
994                 c_lo = 0;
995                 shift128RightJamming(c_hi, c_lo,
996                                      exp_diff,
997                                      &c_hi, &c_lo);
998                 add128(hi, lo, c_hi, c_lo, &hi, &lo);
999                 /* move binary point back to 62 */
1000                 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1001             }
1002 
1003             if (lo & DECOMPOSED_OVERFLOW_BIT) {
1004                 shift64RightJamming(lo, 1, &lo);
1005                 p_exp += 1;
1006             }
1007 
1008         } else {
1009             /* Subtraction */
1010             uint64_t c_hi, c_lo;
1011             /* make C binary point match product at bit 124 */
1012             c_hi = c.frac >> 2;
1013             c_lo = 0;
1014 
1015             if (exp_diff <= 0) {
1016                 shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
1017                 if (exp_diff == 0
1018                     &&
1019                     (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
1020                     sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1021                 } else {
1022                     sub128(c_hi, c_lo, hi, lo, &hi, &lo);
1023                     p_sign ^= 1;
1024                     p_exp = c.exp;
1025                 }
1026             } else {
1027                 shift128RightJamming(c_hi, c_lo,
1028                                      exp_diff,
1029                                      &c_hi, &c_lo);
1030                 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1031             }
1032 
1033             if (hi == 0 && lo == 0) {
1034                 a.cls = float_class_zero;
1035                 a.sign = s->float_rounding_mode == float_round_down;
1036                 a.sign ^= sign_flip;
1037                 return a;
1038             } else {
1039                 int shift;
1040                 if (hi != 0) {
1041                     shift = clz64(hi);
1042                 } else {
1043                     shift = clz64(lo) + 64;
1044                 }
1045                 /* Normalizing to a binary point of 124 is the
1046                    correct adjust for the exponent.  However since we're
1047                    shifting, we might as well put the binary point back
1048                    at 62 where we really want it.  Therefore shift as
1049                    if we're leaving 1 bit at the top of the word, but
1050                    adjust the exponent as if we're leaving 3 bits.  */
1051                 shift -= 1;
1052                 if (shift >= 64) {
1053                     lo = lo << (shift - 64);
1054                 } else {
1055                     hi = (hi << shift) | (lo >> (64 - shift));
1056                     lo = hi | ((lo << shift) != 0);
1057                 }
1058                 p_exp -= shift - 2;
1059             }
1060         }
1061     }
1062 
1063     if (flags & float_muladd_halve_result) {
1064         p_exp -= 1;
1065     }
1066 
1067     /* finally prepare our result */
1068     a.cls = float_class_normal;
1069     a.sign = p_sign ^ sign_flip;
1070     a.exp = p_exp;
1071     a.frac = lo;
1072 
1073     return a;
1074 }
1075 
1076 float16 __attribute__((flatten)) float16_muladd(float16 a, float16 b, float16 c,
1077                                                 int flags, float_status *status)
1078 {
1079     FloatParts pa = float16_unpack_canonical(a, status);
1080     FloatParts pb = float16_unpack_canonical(b, status);
1081     FloatParts pc = float16_unpack_canonical(c, status);
1082     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1083 
1084     return float16_round_pack_canonical(pr, status);
1085 }
1086 
1087 float32 __attribute__((flatten)) float32_muladd(float32 a, float32 b, float32 c,
1088                                                 int flags, float_status *status)
1089 {
1090     FloatParts pa = float32_unpack_canonical(a, status);
1091     FloatParts pb = float32_unpack_canonical(b, status);
1092     FloatParts pc = float32_unpack_canonical(c, status);
1093     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1094 
1095     return float32_round_pack_canonical(pr, status);
1096 }
1097 
1098 float64 __attribute__((flatten)) float64_muladd(float64 a, float64 b, float64 c,
1099                                                 int flags, float_status *status)
1100 {
1101     FloatParts pa = float64_unpack_canonical(a, status);
1102     FloatParts pb = float64_unpack_canonical(b, status);
1103     FloatParts pc = float64_unpack_canonical(c, status);
1104     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1105 
1106     return float64_round_pack_canonical(pr, status);
1107 }
1108 
1109 /*
1110  * Returns the result of dividing the floating-point value `a' by the
1111  * corresponding value `b'. The operation is performed according to
1112  * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1113  */
1114 
1115 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s)
1116 {
1117     bool sign = a.sign ^ b.sign;
1118 
1119     if (a.cls == float_class_normal && b.cls == float_class_normal) {
1120         uint64_t temp_lo, temp_hi;
1121         int exp = a.exp - b.exp;
1122         if (a.frac < b.frac) {
1123             exp -= 1;
1124             shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1,
1125                               &temp_hi, &temp_lo);
1126         } else {
1127             shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT,
1128                               &temp_hi, &temp_lo);
1129         }
1130         /* LSB of quot is set if inexact which roundandpack will use
1131          * to set flags. Yet again we re-use a for the result */
1132         a.frac = div128To64(temp_lo, temp_hi, b.frac);
1133         a.sign = sign;
1134         a.exp = exp;
1135         return a;
1136     }
1137     /* handle all the NaN cases */
1138     if (is_nan(a.cls) || is_nan(b.cls)) {
1139         return pick_nan(a, b, s);
1140     }
1141     /* 0/0 or Inf/Inf */
1142     if (a.cls == b.cls
1143         &&
1144         (a.cls == float_class_inf || a.cls == float_class_zero)) {
1145         s->float_exception_flags |= float_flag_invalid;
1146         a.cls = float_class_dnan;
1147         return a;
1148     }
1149     /* Div 0 => Inf */
1150     if (b.cls == float_class_zero) {
1151         s->float_exception_flags |= float_flag_divbyzero;
1152         a.cls = float_class_inf;
1153         a.sign = sign;
1154         return a;
1155     }
1156     /* Inf / x or 0 / x */
1157     if (a.cls == float_class_inf || a.cls == float_class_zero) {
1158         a.sign = sign;
1159         return a;
1160     }
1161     /* Div by Inf */
1162     if (b.cls == float_class_inf) {
1163         a.cls = float_class_zero;
1164         a.sign = sign;
1165         return a;
1166     }
1167     g_assert_not_reached();
1168 }
1169 
1170 float16 float16_div(float16 a, float16 b, float_status *status)
1171 {
1172     FloatParts pa = float16_unpack_canonical(a, status);
1173     FloatParts pb = float16_unpack_canonical(b, status);
1174     FloatParts pr = div_floats(pa, pb, status);
1175 
1176     return float16_round_pack_canonical(pr, status);
1177 }
1178 
1179 float32 float32_div(float32 a, float32 b, float_status *status)
1180 {
1181     FloatParts pa = float32_unpack_canonical(a, status);
1182     FloatParts pb = float32_unpack_canonical(b, status);
1183     FloatParts pr = div_floats(pa, pb, status);
1184 
1185     return float32_round_pack_canonical(pr, status);
1186 }
1187 
1188 float64 float64_div(float64 a, float64 b, float_status *status)
1189 {
1190     FloatParts pa = float64_unpack_canonical(a, status);
1191     FloatParts pb = float64_unpack_canonical(b, status);
1192     FloatParts pr = div_floats(pa, pb, status);
1193 
1194     return float64_round_pack_canonical(pr, status);
1195 }
1196 
1197 /*
1198  * Rounds the floating-point value `a' to an integer, and returns the
1199  * result as a floating-point value. The operation is performed
1200  * according to the IEC/IEEE Standard for Binary Floating-Point
1201  * Arithmetic.
1202  */
1203 
1204 static FloatParts round_to_int(FloatParts a, int rounding_mode, float_status *s)
1205 {
1206     if (is_nan(a.cls)) {
1207         return return_nan(a, s);
1208     }
1209 
1210     switch (a.cls) {
1211     case float_class_zero:
1212     case float_class_inf:
1213     case float_class_qnan:
1214         /* already "integral" */
1215         break;
1216     case float_class_normal:
1217         if (a.exp >= DECOMPOSED_BINARY_POINT) {
1218             /* already integral */
1219             break;
1220         }
1221         if (a.exp < 0) {
1222             bool one;
1223             /* all fractional */
1224             s->float_exception_flags |= float_flag_inexact;
1225             switch (rounding_mode) {
1226             case float_round_nearest_even:
1227                 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
1228                 break;
1229             case float_round_ties_away:
1230                 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
1231                 break;
1232             case float_round_to_zero:
1233                 one = false;
1234                 break;
1235             case float_round_up:
1236                 one = !a.sign;
1237                 break;
1238             case float_round_down:
1239                 one = a.sign;
1240                 break;
1241             default:
1242                 g_assert_not_reached();
1243             }
1244 
1245             if (one) {
1246                 a.frac = DECOMPOSED_IMPLICIT_BIT;
1247                 a.exp = 0;
1248             } else {
1249                 a.cls = float_class_zero;
1250             }
1251         } else {
1252             uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
1253             uint64_t frac_lsbm1 = frac_lsb >> 1;
1254             uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
1255             uint64_t rnd_mask = rnd_even_mask >> 1;
1256             uint64_t inc;
1257 
1258             switch (rounding_mode) {
1259             case float_round_nearest_even:
1260                 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
1261                 break;
1262             case float_round_ties_away:
1263                 inc = frac_lsbm1;
1264                 break;
1265             case float_round_to_zero:
1266                 inc = 0;
1267                 break;
1268             case float_round_up:
1269                 inc = a.sign ? 0 : rnd_mask;
1270                 break;
1271             case float_round_down:
1272                 inc = a.sign ? rnd_mask : 0;
1273                 break;
1274             default:
1275                 g_assert_not_reached();
1276             }
1277 
1278             if (a.frac & rnd_mask) {
1279                 s->float_exception_flags |= float_flag_inexact;
1280                 a.frac += inc;
1281                 a.frac &= ~rnd_mask;
1282                 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
1283                     a.frac >>= 1;
1284                     a.exp++;
1285                 }
1286             }
1287         }
1288         break;
1289     default:
1290         g_assert_not_reached();
1291     }
1292     return a;
1293 }
1294 
1295 float16 float16_round_to_int(float16 a, float_status *s)
1296 {
1297     FloatParts pa = float16_unpack_canonical(a, s);
1298     FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1299     return float16_round_pack_canonical(pr, s);
1300 }
1301 
1302 float32 float32_round_to_int(float32 a, float_status *s)
1303 {
1304     FloatParts pa = float32_unpack_canonical(a, s);
1305     FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1306     return float32_round_pack_canonical(pr, s);
1307 }
1308 
1309 float64 float64_round_to_int(float64 a, float_status *s)
1310 {
1311     FloatParts pa = float64_unpack_canonical(a, s);
1312     FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1313     return float64_round_pack_canonical(pr, s);
1314 }
1315 
1316 float64 float64_trunc_to_int(float64 a, float_status *s)
1317 {
1318     FloatParts pa = float64_unpack_canonical(a, s);
1319     FloatParts pr = round_to_int(pa, float_round_to_zero, s);
1320     return float64_round_pack_canonical(pr, s);
1321 }
1322 
1323 /*
1324  * Returns the result of converting the floating-point value `a' to
1325  * the two's complement integer format. The conversion is performed
1326  * according to the IEC/IEEE Standard for Binary Floating-Point
1327  * Arithmetic---which means in particular that the conversion is
1328  * rounded according to the current rounding mode. If `a' is a NaN,
1329  * the largest positive integer is returned. Otherwise, if the
1330  * conversion overflows, the largest integer with the same sign as `a'
1331  * is returned.
1332 */
1333 
1334 static int64_t round_to_int_and_pack(FloatParts in, int rmode,
1335                                      int64_t min, int64_t max,
1336                                      float_status *s)
1337 {
1338     uint64_t r;
1339     int orig_flags = get_float_exception_flags(s);
1340     FloatParts p = round_to_int(in, rmode, s);
1341 
1342     switch (p.cls) {
1343     case float_class_snan:
1344     case float_class_qnan:
1345         return max;
1346     case float_class_inf:
1347         return p.sign ? min : max;
1348     case float_class_zero:
1349         return 0;
1350     case float_class_normal:
1351         if (p.exp < DECOMPOSED_BINARY_POINT) {
1352             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1353         } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1354             r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1355         } else {
1356             r = UINT64_MAX;
1357         }
1358         if (p.sign) {
1359             if (r < -(uint64_t) min) {
1360                 return -r;
1361             } else {
1362                 s->float_exception_flags = orig_flags | float_flag_invalid;
1363                 return min;
1364             }
1365         } else {
1366             if (r < max) {
1367                 return r;
1368             } else {
1369                 s->float_exception_flags = orig_flags | float_flag_invalid;
1370                 return max;
1371             }
1372         }
1373     default:
1374         g_assert_not_reached();
1375     }
1376 }
1377 
1378 #define FLOAT_TO_INT(fsz, isz)                                          \
1379 int ## isz ## _t float ## fsz ## _to_int ## isz(float ## fsz a,         \
1380                                                 float_status *s)        \
1381 {                                                                       \
1382     FloatParts p = float ## fsz ## _unpack_canonical(a, s);             \
1383     return round_to_int_and_pack(p, s->float_rounding_mode,             \
1384                                  INT ## isz ## _MIN, INT ## isz ## _MAX,\
1385                                  s);                                    \
1386 }                                                                       \
1387                                                                         \
1388 int ## isz ## _t float ## fsz ## _to_int ## isz ## _round_to_zero       \
1389  (float ## fsz a, float_status *s)                                      \
1390 {                                                                       \
1391     FloatParts p = float ## fsz ## _unpack_canonical(a, s);             \
1392     return round_to_int_and_pack(p, float_round_to_zero,                \
1393                                  INT ## isz ## _MIN, INT ## isz ## _MAX,\
1394                                  s);                                    \
1395 }
1396 
1397 FLOAT_TO_INT(16, 16)
1398 FLOAT_TO_INT(16, 32)
1399 FLOAT_TO_INT(16, 64)
1400 
1401 FLOAT_TO_INT(32, 16)
1402 FLOAT_TO_INT(32, 32)
1403 FLOAT_TO_INT(32, 64)
1404 
1405 FLOAT_TO_INT(64, 16)
1406 FLOAT_TO_INT(64, 32)
1407 FLOAT_TO_INT(64, 64)
1408 
1409 #undef FLOAT_TO_INT
1410 
1411 /*
1412  *  Returns the result of converting the floating-point value `a' to
1413  *  the unsigned integer format. The conversion is performed according
1414  *  to the IEC/IEEE Standard for Binary Floating-Point
1415  *  Arithmetic---which means in particular that the conversion is
1416  *  rounded according to the current rounding mode. If `a' is a NaN,
1417  *  the largest unsigned integer is returned. Otherwise, if the
1418  *  conversion overflows, the largest unsigned integer is returned. If
1419  *  the 'a' is negative, the result is rounded and zero is returned;
1420  *  values that do not round to zero will raise the inexact exception
1421  *  flag.
1422  */
1423 
1424 static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, uint64_t max,
1425                                        float_status *s)
1426 {
1427     int orig_flags = get_float_exception_flags(s);
1428     FloatParts p = round_to_int(in, rmode, s);
1429 
1430     switch (p.cls) {
1431     case float_class_snan:
1432     case float_class_qnan:
1433         s->float_exception_flags = orig_flags | float_flag_invalid;
1434         return max;
1435     case float_class_inf:
1436         return p.sign ? 0 : max;
1437     case float_class_zero:
1438         return 0;
1439     case float_class_normal:
1440     {
1441         uint64_t r;
1442         if (p.sign) {
1443             s->float_exception_flags = orig_flags | float_flag_invalid;
1444             return 0;
1445         }
1446 
1447         if (p.exp < DECOMPOSED_BINARY_POINT) {
1448             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1449         } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1450             r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1451         } else {
1452             s->float_exception_flags = orig_flags | float_flag_invalid;
1453             return max;
1454         }
1455 
1456         /* For uint64 this will never trip, but if p.exp is too large
1457          * to shift a decomposed fraction we shall have exited via the
1458          * 3rd leg above.
1459          */
1460         if (r > max) {
1461             s->float_exception_flags = orig_flags | float_flag_invalid;
1462             return max;
1463         } else {
1464             return r;
1465         }
1466     }
1467     default:
1468         g_assert_not_reached();
1469     }
1470 }
1471 
1472 #define FLOAT_TO_UINT(fsz, isz) \
1473 uint ## isz ## _t float ## fsz ## _to_uint ## isz(float ## fsz a,       \
1474                                                   float_status *s)      \
1475 {                                                                       \
1476     FloatParts p = float ## fsz ## _unpack_canonical(a, s);             \
1477     return round_to_uint_and_pack(p, s->float_rounding_mode,            \
1478                                  UINT ## isz ## _MAX, s);               \
1479 }                                                                       \
1480                                                                         \
1481 uint ## isz ## _t float ## fsz ## _to_uint ## isz ## _round_to_zero     \
1482  (float ## fsz a, float_status *s)                                      \
1483 {                                                                       \
1484     FloatParts p = float ## fsz ## _unpack_canonical(a, s);             \
1485     return round_to_uint_and_pack(p, s->float_rounding_mode,            \
1486                                  UINT ## isz ## _MAX, s);               \
1487 }
1488 
1489 FLOAT_TO_UINT(16, 16)
1490 FLOAT_TO_UINT(16, 32)
1491 FLOAT_TO_UINT(16, 64)
1492 
1493 FLOAT_TO_UINT(32, 16)
1494 FLOAT_TO_UINT(32, 32)
1495 FLOAT_TO_UINT(32, 64)
1496 
1497 FLOAT_TO_UINT(64, 16)
1498 FLOAT_TO_UINT(64, 32)
1499 FLOAT_TO_UINT(64, 64)
1500 
1501 #undef FLOAT_TO_UINT
1502 
1503 /*
1504  * Integer to float conversions
1505  *
1506  * Returns the result of converting the two's complement integer `a'
1507  * to the floating-point format. The conversion is performed according
1508  * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1509  */
1510 
1511 static FloatParts int_to_float(int64_t a, float_status *status)
1512 {
1513     FloatParts r;
1514     if (a == 0) {
1515         r.cls = float_class_zero;
1516         r.sign = false;
1517     } else if (a == (1ULL << 63)) {
1518         r.cls = float_class_normal;
1519         r.sign = true;
1520         r.frac = DECOMPOSED_IMPLICIT_BIT;
1521         r.exp = 63;
1522     } else {
1523         uint64_t f;
1524         if (a < 0) {
1525             f = -a;
1526             r.sign = true;
1527         } else {
1528             f = a;
1529             r.sign = false;
1530         }
1531         int shift = clz64(f) - 1;
1532         r.cls = float_class_normal;
1533         r.exp = (DECOMPOSED_BINARY_POINT - shift);
1534         r.frac = f << shift;
1535     }
1536 
1537     return r;
1538 }
1539 
1540 float16 int64_to_float16(int64_t a, float_status *status)
1541 {
1542     FloatParts pa = int_to_float(a, status);
1543     return float16_round_pack_canonical(pa, status);
1544 }
1545 
1546 float16 int32_to_float16(int32_t a, float_status *status)
1547 {
1548     return int64_to_float16(a, status);
1549 }
1550 
1551 float16 int16_to_float16(int16_t a, float_status *status)
1552 {
1553     return int64_to_float16(a, status);
1554 }
1555 
1556 float32 int64_to_float32(int64_t a, float_status *status)
1557 {
1558     FloatParts pa = int_to_float(a, status);
1559     return float32_round_pack_canonical(pa, status);
1560 }
1561 
1562 float32 int32_to_float32(int32_t a, float_status *status)
1563 {
1564     return int64_to_float32(a, status);
1565 }
1566 
1567 float32 int16_to_float32(int16_t a, float_status *status)
1568 {
1569     return int64_to_float32(a, status);
1570 }
1571 
1572 float64 int64_to_float64(int64_t a, float_status *status)
1573 {
1574     FloatParts pa = int_to_float(a, status);
1575     return float64_round_pack_canonical(pa, status);
1576 }
1577 
1578 float64 int32_to_float64(int32_t a, float_status *status)
1579 {
1580     return int64_to_float64(a, status);
1581 }
1582 
1583 float64 int16_to_float64(int16_t a, float_status *status)
1584 {
1585     return int64_to_float64(a, status);
1586 }
1587 
1588 
1589 /*
1590  * Unsigned Integer to float conversions
1591  *
1592  * Returns the result of converting the unsigned integer `a' to the
1593  * floating-point format. The conversion is performed according to the
1594  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1595  */
1596 
1597 static FloatParts uint_to_float(uint64_t a, float_status *status)
1598 {
1599     FloatParts r = { .sign = false};
1600 
1601     if (a == 0) {
1602         r.cls = float_class_zero;
1603     } else {
1604         int spare_bits = clz64(a) - 1;
1605         r.cls = float_class_normal;
1606         r.exp = DECOMPOSED_BINARY_POINT - spare_bits;
1607         if (spare_bits < 0) {
1608             shift64RightJamming(a, -spare_bits, &a);
1609             r.frac = a;
1610         } else {
1611             r.frac = a << spare_bits;
1612         }
1613     }
1614 
1615     return r;
1616 }
1617 
1618 float16 uint64_to_float16(uint64_t a, float_status *status)
1619 {
1620     FloatParts pa = uint_to_float(a, status);
1621     return float16_round_pack_canonical(pa, status);
1622 }
1623 
1624 float16 uint32_to_float16(uint32_t a, float_status *status)
1625 {
1626     return uint64_to_float16(a, status);
1627 }
1628 
1629 float16 uint16_to_float16(uint16_t a, float_status *status)
1630 {
1631     return uint64_to_float16(a, status);
1632 }
1633 
1634 float32 uint64_to_float32(uint64_t a, float_status *status)
1635 {
1636     FloatParts pa = uint_to_float(a, status);
1637     return float32_round_pack_canonical(pa, status);
1638 }
1639 
1640 float32 uint32_to_float32(uint32_t a, float_status *status)
1641 {
1642     return uint64_to_float32(a, status);
1643 }
1644 
1645 float32 uint16_to_float32(uint16_t a, float_status *status)
1646 {
1647     return uint64_to_float32(a, status);
1648 }
1649 
1650 float64 uint64_to_float64(uint64_t a, float_status *status)
1651 {
1652     FloatParts pa = uint_to_float(a, status);
1653     return float64_round_pack_canonical(pa, status);
1654 }
1655 
1656 float64 uint32_to_float64(uint32_t a, float_status *status)
1657 {
1658     return uint64_to_float64(a, status);
1659 }
1660 
1661 float64 uint16_to_float64(uint16_t a, float_status *status)
1662 {
1663     return uint64_to_float64(a, status);
1664 }
1665 
1666 /* Float Min/Max */
1667 /* min() and max() functions. These can't be implemented as
1668  * 'compare and pick one input' because that would mishandle
1669  * NaNs and +0 vs -0.
1670  *
1671  * minnum() and maxnum() functions. These are similar to the min()
1672  * and max() functions but if one of the arguments is a QNaN and
1673  * the other is numerical then the numerical argument is returned.
1674  * SNaNs will get quietened before being returned.
1675  * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
1676  * and maxNum() operations. min() and max() are the typical min/max
1677  * semantics provided by many CPUs which predate that specification.
1678  *
1679  * minnummag() and maxnummag() functions correspond to minNumMag()
1680  * and minNumMag() from the IEEE-754 2008.
1681  */
1682 static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin,
1683                                 bool ieee, bool ismag, float_status *s)
1684 {
1685     if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
1686         if (ieee) {
1687             /* Takes two floating-point values `a' and `b', one of
1688              * which is a NaN, and returns the appropriate NaN
1689              * result. If either `a' or `b' is a signaling NaN,
1690              * the invalid exception is raised.
1691              */
1692             if (is_snan(a.cls) || is_snan(b.cls)) {
1693                 return pick_nan(a, b, s);
1694             } else if (is_nan(a.cls) && !is_nan(b.cls)) {
1695                 return b;
1696             } else if (is_nan(b.cls) && !is_nan(a.cls)) {
1697                 return a;
1698             }
1699         }
1700         return pick_nan(a, b, s);
1701     } else {
1702         int a_exp, b_exp;
1703         bool a_sign, b_sign;
1704 
1705         switch (a.cls) {
1706         case float_class_normal:
1707             a_exp = a.exp;
1708             break;
1709         case float_class_inf:
1710             a_exp = INT_MAX;
1711             break;
1712         case float_class_zero:
1713             a_exp = INT_MIN;
1714             break;
1715         default:
1716             g_assert_not_reached();
1717             break;
1718         }
1719         switch (b.cls) {
1720         case float_class_normal:
1721             b_exp = b.exp;
1722             break;
1723         case float_class_inf:
1724             b_exp = INT_MAX;
1725             break;
1726         case float_class_zero:
1727             b_exp = INT_MIN;
1728             break;
1729         default:
1730             g_assert_not_reached();
1731             break;
1732         }
1733 
1734         a_sign = a.sign;
1735         b_sign = b.sign;
1736         if (ismag) {
1737             a_sign = b_sign = 0;
1738         }
1739 
1740         if (a_sign == b_sign) {
1741             bool a_less = a_exp < b_exp;
1742             if (a_exp == b_exp) {
1743                 a_less = a.frac < b.frac;
1744             }
1745             return a_sign ^ a_less ^ ismin ? b : a;
1746         } else {
1747             return a_sign ^ ismin ? b : a;
1748         }
1749     }
1750 }
1751 
1752 #define MINMAX(sz, name, ismin, isiee, ismag)                           \
1753 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b,      \
1754                                      float_status *s)                   \
1755 {                                                                       \
1756     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
1757     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
1758     FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);      \
1759                                                                         \
1760     return float ## sz ## _round_pack_canonical(pr, s);                 \
1761 }
1762 
1763 MINMAX(16, min, true, false, false)
1764 MINMAX(16, minnum, true, true, false)
1765 MINMAX(16, minnummag, true, true, true)
1766 MINMAX(16, max, false, false, false)
1767 MINMAX(16, maxnum, false, true, false)
1768 MINMAX(16, maxnummag, false, true, true)
1769 
1770 MINMAX(32, min, true, false, false)
1771 MINMAX(32, minnum, true, true, false)
1772 MINMAX(32, minnummag, true, true, true)
1773 MINMAX(32, max, false, false, false)
1774 MINMAX(32, maxnum, false, true, false)
1775 MINMAX(32, maxnummag, false, true, true)
1776 
1777 MINMAX(64, min, true, false, false)
1778 MINMAX(64, minnum, true, true, false)
1779 MINMAX(64, minnummag, true, true, true)
1780 MINMAX(64, max, false, false, false)
1781 MINMAX(64, maxnum, false, true, false)
1782 MINMAX(64, maxnummag, false, true, true)
1783 
1784 #undef MINMAX
1785 
1786 /* Floating point compare */
1787 static int compare_floats(FloatParts a, FloatParts b, bool is_quiet,
1788                           float_status *s)
1789 {
1790     if (is_nan(a.cls) || is_nan(b.cls)) {
1791         if (!is_quiet ||
1792             a.cls == float_class_snan ||
1793             b.cls == float_class_snan) {
1794             s->float_exception_flags |= float_flag_invalid;
1795         }
1796         return float_relation_unordered;
1797     }
1798 
1799     if (a.cls == float_class_zero) {
1800         if (b.cls == float_class_zero) {
1801             return float_relation_equal;
1802         }
1803         return b.sign ? float_relation_greater : float_relation_less;
1804     } else if (b.cls == float_class_zero) {
1805         return a.sign ? float_relation_less : float_relation_greater;
1806     }
1807 
1808     /* The only really important thing about infinity is its sign. If
1809      * both are infinities the sign marks the smallest of the two.
1810      */
1811     if (a.cls == float_class_inf) {
1812         if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
1813             return float_relation_equal;
1814         }
1815         return a.sign ? float_relation_less : float_relation_greater;
1816     } else if (b.cls == float_class_inf) {
1817         return b.sign ? float_relation_greater : float_relation_less;
1818     }
1819 
1820     if (a.sign != b.sign) {
1821         return a.sign ? float_relation_less : float_relation_greater;
1822     }
1823 
1824     if (a.exp == b.exp) {
1825         if (a.frac == b.frac) {
1826             return float_relation_equal;
1827         }
1828         if (a.sign) {
1829             return a.frac > b.frac ?
1830                 float_relation_less : float_relation_greater;
1831         } else {
1832             return a.frac > b.frac ?
1833                 float_relation_greater : float_relation_less;
1834         }
1835     } else {
1836         if (a.sign) {
1837             return a.exp > b.exp ? float_relation_less : float_relation_greater;
1838         } else {
1839             return a.exp > b.exp ? float_relation_greater : float_relation_less;
1840         }
1841     }
1842 }
1843 
1844 #define COMPARE(sz)                                                     \
1845 int float ## sz ## _compare(float ## sz a, float ## sz b,               \
1846                             float_status *s)                            \
1847 {                                                                       \
1848     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
1849     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
1850     return compare_floats(pa, pb, false, s);                            \
1851 }                                                                       \
1852 int float ## sz ## _compare_quiet(float ## sz a, float ## sz b,         \
1853                                   float_status *s)                      \
1854 {                                                                       \
1855     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
1856     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
1857     return compare_floats(pa, pb, true, s);                             \
1858 }
1859 
1860 COMPARE(16)
1861 COMPARE(32)
1862 COMPARE(64)
1863 
1864 #undef COMPARE
1865 
1866 /* Multiply A by 2 raised to the power N.  */
1867 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s)
1868 {
1869     if (unlikely(is_nan(a.cls))) {
1870         return return_nan(a, s);
1871     }
1872     if (a.cls == float_class_normal) {
1873         a.exp += n;
1874     }
1875     return a;
1876 }
1877 
1878 float16 float16_scalbn(float16 a, int n, float_status *status)
1879 {
1880     FloatParts pa = float16_unpack_canonical(a, status);
1881     FloatParts pr = scalbn_decomposed(pa, n, status);
1882     return float16_round_pack_canonical(pr, status);
1883 }
1884 
1885 float32 float32_scalbn(float32 a, int n, float_status *status)
1886 {
1887     FloatParts pa = float32_unpack_canonical(a, status);
1888     FloatParts pr = scalbn_decomposed(pa, n, status);
1889     return float32_round_pack_canonical(pr, status);
1890 }
1891 
1892 float64 float64_scalbn(float64 a, int n, float_status *status)
1893 {
1894     FloatParts pa = float64_unpack_canonical(a, status);
1895     FloatParts pr = scalbn_decomposed(pa, n, status);
1896     return float64_round_pack_canonical(pr, status);
1897 }
1898 
1899 /*
1900  * Square Root
1901  *
1902  * The old softfloat code did an approximation step before zeroing in
1903  * on the final result. However for simpleness we just compute the
1904  * square root by iterating down from the implicit bit to enough extra
1905  * bits to ensure we get a correctly rounded result.
1906  *
1907  * This does mean however the calculation is slower than before,
1908  * especially for 64 bit floats.
1909  */
1910 
1911 static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p)
1912 {
1913     uint64_t a_frac, r_frac, s_frac;
1914     int bit, last_bit;
1915 
1916     if (is_nan(a.cls)) {
1917         return return_nan(a, s);
1918     }
1919     if (a.cls == float_class_zero) {
1920         return a;  /* sqrt(+-0) = +-0 */
1921     }
1922     if (a.sign) {
1923         s->float_exception_flags |= float_flag_invalid;
1924         a.cls = float_class_dnan;
1925         return a;
1926     }
1927     if (a.cls == float_class_inf) {
1928         return a;  /* sqrt(+inf) = +inf */
1929     }
1930 
1931     assert(a.cls == float_class_normal);
1932 
1933     /* We need two overflow bits at the top. Adding room for that is a
1934      * right shift. If the exponent is odd, we can discard the low bit
1935      * by multiplying the fraction by 2; that's a left shift. Combine
1936      * those and we shift right if the exponent is even.
1937      */
1938     a_frac = a.frac;
1939     if (!(a.exp & 1)) {
1940         a_frac >>= 1;
1941     }
1942     a.exp >>= 1;
1943 
1944     /* Bit-by-bit computation of sqrt.  */
1945     r_frac = 0;
1946     s_frac = 0;
1947 
1948     /* Iterate from implicit bit down to the 3 extra bits to compute a
1949      * properly rounded result. Remember we've inserted one more bit
1950      * at the top, so these positions are one less.
1951      */
1952     bit = DECOMPOSED_BINARY_POINT - 1;
1953     last_bit = MAX(p->frac_shift - 4, 0);
1954     do {
1955         uint64_t q = 1ULL << bit;
1956         uint64_t t_frac = s_frac + q;
1957         if (t_frac <= a_frac) {
1958             s_frac = t_frac + q;
1959             a_frac -= t_frac;
1960             r_frac += q;
1961         }
1962         a_frac <<= 1;
1963     } while (--bit >= last_bit);
1964 
1965     /* Undo the right shift done above. If there is any remaining
1966      * fraction, the result is inexact. Set the sticky bit.
1967      */
1968     a.frac = (r_frac << 1) + (a_frac != 0);
1969 
1970     return a;
1971 }
1972 
1973 float16 __attribute__((flatten)) float16_sqrt(float16 a, float_status *status)
1974 {
1975     FloatParts pa = float16_unpack_canonical(a, status);
1976     FloatParts pr = sqrt_float(pa, status, &float16_params);
1977     return float16_round_pack_canonical(pr, status);
1978 }
1979 
1980 float32 __attribute__((flatten)) float32_sqrt(float32 a, float_status *status)
1981 {
1982     FloatParts pa = float32_unpack_canonical(a, status);
1983     FloatParts pr = sqrt_float(pa, status, &float32_params);
1984     return float32_round_pack_canonical(pr, status);
1985 }
1986 
1987 float64 __attribute__((flatten)) float64_sqrt(float64 a, float_status *status)
1988 {
1989     FloatParts pa = float64_unpack_canonical(a, status);
1990     FloatParts pr = sqrt_float(pa, status, &float64_params);
1991     return float64_round_pack_canonical(pr, status);
1992 }
1993 
1994 
1995 /*----------------------------------------------------------------------------
1996 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
1997 | and 7, and returns the properly rounded 32-bit integer corresponding to the
1998 | input.  If `zSign' is 1, the input is negated before being converted to an
1999 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
2000 | is simply rounded to an integer, with the inexact exception raised if the
2001 | input cannot be represented exactly as an integer.  However, if the fixed-
2002 | point input is too large, the invalid exception is raised and the largest
2003 | positive or negative integer is returned.
2004 *----------------------------------------------------------------------------*/
2005 
2006 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
2007 {
2008     int8_t roundingMode;
2009     flag roundNearestEven;
2010     int8_t roundIncrement, roundBits;
2011     int32_t z;
2012 
2013     roundingMode = status->float_rounding_mode;
2014     roundNearestEven = ( roundingMode == float_round_nearest_even );
2015     switch (roundingMode) {
2016     case float_round_nearest_even:
2017     case float_round_ties_away:
2018         roundIncrement = 0x40;
2019         break;
2020     case float_round_to_zero:
2021         roundIncrement = 0;
2022         break;
2023     case float_round_up:
2024         roundIncrement = zSign ? 0 : 0x7f;
2025         break;
2026     case float_round_down:
2027         roundIncrement = zSign ? 0x7f : 0;
2028         break;
2029     default:
2030         abort();
2031     }
2032     roundBits = absZ & 0x7F;
2033     absZ = ( absZ + roundIncrement )>>7;
2034     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
2035     z = absZ;
2036     if ( zSign ) z = - z;
2037     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
2038         float_raise(float_flag_invalid, status);
2039         return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
2040     }
2041     if (roundBits) {
2042         status->float_exception_flags |= float_flag_inexact;
2043     }
2044     return z;
2045 
2046 }
2047 
2048 /*----------------------------------------------------------------------------
2049 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2050 | `absZ1', with binary point between bits 63 and 64 (between the input words),
2051 | and returns the properly rounded 64-bit integer corresponding to the input.
2052 | If `zSign' is 1, the input is negated before being converted to an integer.
2053 | Ordinarily, the fixed-point input is simply rounded to an integer, with
2054 | the inexact exception raised if the input cannot be represented exactly as
2055 | an integer.  However, if the fixed-point input is too large, the invalid
2056 | exception is raised and the largest positive or negative integer is
2057 | returned.
2058 *----------------------------------------------------------------------------*/
2059 
2060 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
2061                                float_status *status)
2062 {
2063     int8_t roundingMode;
2064     flag roundNearestEven, increment;
2065     int64_t z;
2066 
2067     roundingMode = status->float_rounding_mode;
2068     roundNearestEven = ( roundingMode == float_round_nearest_even );
2069     switch (roundingMode) {
2070     case float_round_nearest_even:
2071     case float_round_ties_away:
2072         increment = ((int64_t) absZ1 < 0);
2073         break;
2074     case float_round_to_zero:
2075         increment = 0;
2076         break;
2077     case float_round_up:
2078         increment = !zSign && absZ1;
2079         break;
2080     case float_round_down:
2081         increment = zSign && absZ1;
2082         break;
2083     default:
2084         abort();
2085     }
2086     if ( increment ) {
2087         ++absZ0;
2088         if ( absZ0 == 0 ) goto overflow;
2089         absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
2090     }
2091     z = absZ0;
2092     if ( zSign ) z = - z;
2093     if ( z && ( ( z < 0 ) ^ zSign ) ) {
2094  overflow:
2095         float_raise(float_flag_invalid, status);
2096         return
2097               zSign ? (int64_t) LIT64( 0x8000000000000000 )
2098             : LIT64( 0x7FFFFFFFFFFFFFFF );
2099     }
2100     if (absZ1) {
2101         status->float_exception_flags |= float_flag_inexact;
2102     }
2103     return z;
2104 
2105 }
2106 
2107 /*----------------------------------------------------------------------------
2108 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2109 | `absZ1', with binary point between bits 63 and 64 (between the input words),
2110 | and returns the properly rounded 64-bit unsigned integer corresponding to the
2111 | input.  Ordinarily, the fixed-point input is simply rounded to an integer,
2112 | with the inexact exception raised if the input cannot be represented exactly
2113 | as an integer.  However, if the fixed-point input is too large, the invalid
2114 | exception is raised and the largest unsigned integer is returned.
2115 *----------------------------------------------------------------------------*/
2116 
2117 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
2118                                 uint64_t absZ1, float_status *status)
2119 {
2120     int8_t roundingMode;
2121     flag roundNearestEven, increment;
2122 
2123     roundingMode = status->float_rounding_mode;
2124     roundNearestEven = (roundingMode == float_round_nearest_even);
2125     switch (roundingMode) {
2126     case float_round_nearest_even:
2127     case float_round_ties_away:
2128         increment = ((int64_t)absZ1 < 0);
2129         break;
2130     case float_round_to_zero:
2131         increment = 0;
2132         break;
2133     case float_round_up:
2134         increment = !zSign && absZ1;
2135         break;
2136     case float_round_down:
2137         increment = zSign && absZ1;
2138         break;
2139     default:
2140         abort();
2141     }
2142     if (increment) {
2143         ++absZ0;
2144         if (absZ0 == 0) {
2145             float_raise(float_flag_invalid, status);
2146             return LIT64(0xFFFFFFFFFFFFFFFF);
2147         }
2148         absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
2149     }
2150 
2151     if (zSign && absZ0) {
2152         float_raise(float_flag_invalid, status);
2153         return 0;
2154     }
2155 
2156     if (absZ1) {
2157         status->float_exception_flags |= float_flag_inexact;
2158     }
2159     return absZ0;
2160 }
2161 
2162 /*----------------------------------------------------------------------------
2163 | If `a' is denormal and we are in flush-to-zero mode then set the
2164 | input-denormal exception and return zero. Otherwise just return the value.
2165 *----------------------------------------------------------------------------*/
2166 float32 float32_squash_input_denormal(float32 a, float_status *status)
2167 {
2168     if (status->flush_inputs_to_zero) {
2169         if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
2170             float_raise(float_flag_input_denormal, status);
2171             return make_float32(float32_val(a) & 0x80000000);
2172         }
2173     }
2174     return a;
2175 }
2176 
2177 /*----------------------------------------------------------------------------
2178 | Normalizes the subnormal single-precision floating-point value represented
2179 | by the denormalized significand `aSig'.  The normalized exponent and
2180 | significand are stored at the locations pointed to by `zExpPtr' and
2181 | `zSigPtr', respectively.
2182 *----------------------------------------------------------------------------*/
2183 
2184 static void
2185  normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
2186 {
2187     int8_t shiftCount;
2188 
2189     shiftCount = countLeadingZeros32( aSig ) - 8;
2190     *zSigPtr = aSig<<shiftCount;
2191     *zExpPtr = 1 - shiftCount;
2192 
2193 }
2194 
2195 /*----------------------------------------------------------------------------
2196 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2197 | single-precision floating-point value, returning the result.  After being
2198 | shifted into the proper positions, the three fields are simply added
2199 | together to form the result.  This means that any integer portion of `zSig'
2200 | will be added into the exponent.  Since a properly normalized significand
2201 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2202 | than the desired result exponent whenever `zSig' is a complete, normalized
2203 | significand.
2204 *----------------------------------------------------------------------------*/
2205 
2206 static inline float32 packFloat32(flag zSign, int zExp, uint32_t zSig)
2207 {
2208 
2209     return make_float32(
2210           ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
2211 
2212 }
2213 
2214 /*----------------------------------------------------------------------------
2215 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2216 | and significand `zSig', and returns the proper single-precision floating-
2217 | point value corresponding to the abstract input.  Ordinarily, the abstract
2218 | value is simply rounded and packed into the single-precision format, with
2219 | the inexact exception raised if the abstract input cannot be represented
2220 | exactly.  However, if the abstract value is too large, the overflow and
2221 | inexact exceptions are raised and an infinity or maximal finite value is
2222 | returned.  If the abstract value is too small, the input value is rounded to
2223 | a subnormal number, and the underflow and inexact exceptions are raised if
2224 | the abstract input cannot be represented exactly as a subnormal single-
2225 | precision floating-point number.
2226 |     The input significand `zSig' has its binary point between bits 30
2227 | and 29, which is 7 bits to the left of the usual location.  This shifted
2228 | significand must be normalized or smaller.  If `zSig' is not normalized,
2229 | `zExp' must be 0; in that case, the result returned is a subnormal number,
2230 | and it must not require rounding.  In the usual case that `zSig' is
2231 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2232 | The handling of underflow and overflow follows the IEC/IEEE Standard for
2233 | Binary Floating-Point Arithmetic.
2234 *----------------------------------------------------------------------------*/
2235 
2236 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
2237                                    float_status *status)
2238 {
2239     int8_t roundingMode;
2240     flag roundNearestEven;
2241     int8_t roundIncrement, roundBits;
2242     flag isTiny;
2243 
2244     roundingMode = status->float_rounding_mode;
2245     roundNearestEven = ( roundingMode == float_round_nearest_even );
2246     switch (roundingMode) {
2247     case float_round_nearest_even:
2248     case float_round_ties_away:
2249         roundIncrement = 0x40;
2250         break;
2251     case float_round_to_zero:
2252         roundIncrement = 0;
2253         break;
2254     case float_round_up:
2255         roundIncrement = zSign ? 0 : 0x7f;
2256         break;
2257     case float_round_down:
2258         roundIncrement = zSign ? 0x7f : 0;
2259         break;
2260     default:
2261         abort();
2262         break;
2263     }
2264     roundBits = zSig & 0x7F;
2265     if ( 0xFD <= (uint16_t) zExp ) {
2266         if (    ( 0xFD < zExp )
2267              || (    ( zExp == 0xFD )
2268                   && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
2269            ) {
2270             float_raise(float_flag_overflow | float_flag_inexact, status);
2271             return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
2272         }
2273         if ( zExp < 0 ) {
2274             if (status->flush_to_zero) {
2275                 float_raise(float_flag_output_denormal, status);
2276                 return packFloat32(zSign, 0, 0);
2277             }
2278             isTiny =
2279                 (status->float_detect_tininess
2280                  == float_tininess_before_rounding)
2281                 || ( zExp < -1 )
2282                 || ( zSig + roundIncrement < 0x80000000 );
2283             shift32RightJamming( zSig, - zExp, &zSig );
2284             zExp = 0;
2285             roundBits = zSig & 0x7F;
2286             if (isTiny && roundBits) {
2287                 float_raise(float_flag_underflow, status);
2288             }
2289         }
2290     }
2291     if (roundBits) {
2292         status->float_exception_flags |= float_flag_inexact;
2293     }
2294     zSig = ( zSig + roundIncrement )>>7;
2295     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
2296     if ( zSig == 0 ) zExp = 0;
2297     return packFloat32( zSign, zExp, zSig );
2298 
2299 }
2300 
2301 /*----------------------------------------------------------------------------
2302 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2303 | and significand `zSig', and returns the proper single-precision floating-
2304 | point value corresponding to the abstract input.  This routine is just like
2305 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
2306 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2307 | floating-point exponent.
2308 *----------------------------------------------------------------------------*/
2309 
2310 static float32
2311  normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
2312                               float_status *status)
2313 {
2314     int8_t shiftCount;
2315 
2316     shiftCount = countLeadingZeros32( zSig ) - 1;
2317     return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
2318                                status);
2319 
2320 }
2321 
2322 /*----------------------------------------------------------------------------
2323 | If `a' is denormal and we are in flush-to-zero mode then set the
2324 | input-denormal exception and return zero. Otherwise just return the value.
2325 *----------------------------------------------------------------------------*/
2326 float64 float64_squash_input_denormal(float64 a, float_status *status)
2327 {
2328     if (status->flush_inputs_to_zero) {
2329         if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
2330             float_raise(float_flag_input_denormal, status);
2331             return make_float64(float64_val(a) & (1ULL << 63));
2332         }
2333     }
2334     return a;
2335 }
2336 
2337 /*----------------------------------------------------------------------------
2338 | Normalizes the subnormal double-precision floating-point value represented
2339 | by the denormalized significand `aSig'.  The normalized exponent and
2340 | significand are stored at the locations pointed to by `zExpPtr' and
2341 | `zSigPtr', respectively.
2342 *----------------------------------------------------------------------------*/
2343 
2344 static void
2345  normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
2346 {
2347     int8_t shiftCount;
2348 
2349     shiftCount = countLeadingZeros64( aSig ) - 11;
2350     *zSigPtr = aSig<<shiftCount;
2351     *zExpPtr = 1 - shiftCount;
2352 
2353 }
2354 
2355 /*----------------------------------------------------------------------------
2356 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2357 | double-precision floating-point value, returning the result.  After being
2358 | shifted into the proper positions, the three fields are simply added
2359 | together to form the result.  This means that any integer portion of `zSig'
2360 | will be added into the exponent.  Since a properly normalized significand
2361 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2362 | than the desired result exponent whenever `zSig' is a complete, normalized
2363 | significand.
2364 *----------------------------------------------------------------------------*/
2365 
2366 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig)
2367 {
2368 
2369     return make_float64(
2370         ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
2371 
2372 }
2373 
2374 /*----------------------------------------------------------------------------
2375 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2376 | and significand `zSig', and returns the proper double-precision floating-
2377 | point value corresponding to the abstract input.  Ordinarily, the abstract
2378 | value is simply rounded and packed into the double-precision format, with
2379 | the inexact exception raised if the abstract input cannot be represented
2380 | exactly.  However, if the abstract value is too large, the overflow and
2381 | inexact exceptions are raised and an infinity or maximal finite value is
2382 | returned.  If the abstract value is too small, the input value is rounded to
2383 | a subnormal number, and the underflow and inexact exceptions are raised if
2384 | the abstract input cannot be represented exactly as a subnormal double-
2385 | precision floating-point number.
2386 |     The input significand `zSig' has its binary point between bits 62
2387 | and 61, which is 10 bits to the left of the usual location.  This shifted
2388 | significand must be normalized or smaller.  If `zSig' is not normalized,
2389 | `zExp' must be 0; in that case, the result returned is a subnormal number,
2390 | and it must not require rounding.  In the usual case that `zSig' is
2391 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2392 | The handling of underflow and overflow follows the IEC/IEEE Standard for
2393 | Binary Floating-Point Arithmetic.
2394 *----------------------------------------------------------------------------*/
2395 
2396 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
2397                                    float_status *status)
2398 {
2399     int8_t roundingMode;
2400     flag roundNearestEven;
2401     int roundIncrement, roundBits;
2402     flag isTiny;
2403 
2404     roundingMode = status->float_rounding_mode;
2405     roundNearestEven = ( roundingMode == float_round_nearest_even );
2406     switch (roundingMode) {
2407     case float_round_nearest_even:
2408     case float_round_ties_away:
2409         roundIncrement = 0x200;
2410         break;
2411     case float_round_to_zero:
2412         roundIncrement = 0;
2413         break;
2414     case float_round_up:
2415         roundIncrement = zSign ? 0 : 0x3ff;
2416         break;
2417     case float_round_down:
2418         roundIncrement = zSign ? 0x3ff : 0;
2419         break;
2420     case float_round_to_odd:
2421         roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
2422         break;
2423     default:
2424         abort();
2425     }
2426     roundBits = zSig & 0x3FF;
2427     if ( 0x7FD <= (uint16_t) zExp ) {
2428         if (    ( 0x7FD < zExp )
2429              || (    ( zExp == 0x7FD )
2430                   && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
2431            ) {
2432             bool overflow_to_inf = roundingMode != float_round_to_odd &&
2433                                    roundIncrement != 0;
2434             float_raise(float_flag_overflow | float_flag_inexact, status);
2435             return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
2436         }
2437         if ( zExp < 0 ) {
2438             if (status->flush_to_zero) {
2439                 float_raise(float_flag_output_denormal, status);
2440                 return packFloat64(zSign, 0, 0);
2441             }
2442             isTiny =
2443                    (status->float_detect_tininess
2444                     == float_tininess_before_rounding)
2445                 || ( zExp < -1 )
2446                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
2447             shift64RightJamming( zSig, - zExp, &zSig );
2448             zExp = 0;
2449             roundBits = zSig & 0x3FF;
2450             if (isTiny && roundBits) {
2451                 float_raise(float_flag_underflow, status);
2452             }
2453             if (roundingMode == float_round_to_odd) {
2454                 /*
2455                  * For round-to-odd case, the roundIncrement depends on
2456                  * zSig which just changed.
2457                  */
2458                 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
2459             }
2460         }
2461     }
2462     if (roundBits) {
2463         status->float_exception_flags |= float_flag_inexact;
2464     }
2465     zSig = ( zSig + roundIncrement )>>10;
2466     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
2467     if ( zSig == 0 ) zExp = 0;
2468     return packFloat64( zSign, zExp, zSig );
2469 
2470 }
2471 
2472 /*----------------------------------------------------------------------------
2473 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2474 | and significand `zSig', and returns the proper double-precision floating-
2475 | point value corresponding to the abstract input.  This routine is just like
2476 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
2477 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2478 | floating-point exponent.
2479 *----------------------------------------------------------------------------*/
2480 
2481 static float64
2482  normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
2483                               float_status *status)
2484 {
2485     int8_t shiftCount;
2486 
2487     shiftCount = countLeadingZeros64( zSig ) - 1;
2488     return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
2489                                status);
2490 
2491 }
2492 
2493 /*----------------------------------------------------------------------------
2494 | Returns the fraction bits of the extended double-precision floating-point
2495 | value `a'.
2496 *----------------------------------------------------------------------------*/
2497 
2498 static inline uint64_t extractFloatx80Frac( floatx80 a )
2499 {
2500 
2501     return a.low;
2502 
2503 }
2504 
2505 /*----------------------------------------------------------------------------
2506 | Returns the exponent bits of the extended double-precision floating-point
2507 | value `a'.
2508 *----------------------------------------------------------------------------*/
2509 
2510 static inline int32_t extractFloatx80Exp( floatx80 a )
2511 {
2512 
2513     return a.high & 0x7FFF;
2514 
2515 }
2516 
2517 /*----------------------------------------------------------------------------
2518 | Returns the sign bit of the extended double-precision floating-point value
2519 | `a'.
2520 *----------------------------------------------------------------------------*/
2521 
2522 static inline flag extractFloatx80Sign( floatx80 a )
2523 {
2524 
2525     return a.high>>15;
2526 
2527 }
2528 
2529 /*----------------------------------------------------------------------------
2530 | Normalizes the subnormal extended double-precision floating-point value
2531 | represented by the denormalized significand `aSig'.  The normalized exponent
2532 | and significand are stored at the locations pointed to by `zExpPtr' and
2533 | `zSigPtr', respectively.
2534 *----------------------------------------------------------------------------*/
2535 
2536 static void
2537  normalizeFloatx80Subnormal( uint64_t aSig, int32_t *zExpPtr, uint64_t *zSigPtr )
2538 {
2539     int8_t shiftCount;
2540 
2541     shiftCount = countLeadingZeros64( aSig );
2542     *zSigPtr = aSig<<shiftCount;
2543     *zExpPtr = 1 - shiftCount;
2544 
2545 }
2546 
2547 /*----------------------------------------------------------------------------
2548 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
2549 | extended double-precision floating-point value, returning the result.
2550 *----------------------------------------------------------------------------*/
2551 
2552 static inline floatx80 packFloatx80( flag zSign, int32_t zExp, uint64_t zSig )
2553 {
2554     floatx80 z;
2555 
2556     z.low = zSig;
2557     z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
2558     return z;
2559 
2560 }
2561 
2562 /*----------------------------------------------------------------------------
2563 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2564 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
2565 | and returns the proper extended double-precision floating-point value
2566 | corresponding to the abstract input.  Ordinarily, the abstract value is
2567 | rounded and packed into the extended double-precision format, with the
2568 | inexact exception raised if the abstract input cannot be represented
2569 | exactly.  However, if the abstract value is too large, the overflow and
2570 | inexact exceptions are raised and an infinity or maximal finite value is
2571 | returned.  If the abstract value is too small, the input value is rounded to
2572 | a subnormal number, and the underflow and inexact exceptions are raised if
2573 | the abstract input cannot be represented exactly as a subnormal extended
2574 | double-precision floating-point number.
2575 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
2576 | number of bits as single or double precision, respectively.  Otherwise, the
2577 | result is rounded to the full precision of the extended double-precision
2578 | format.
2579 |     The input significand must be normalized or smaller.  If the input
2580 | significand is not normalized, `zExp' must be 0; in that case, the result
2581 | returned is a subnormal number, and it must not require rounding.  The
2582 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
2583 | Floating-Point Arithmetic.
2584 *----------------------------------------------------------------------------*/
2585 
2586 static floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
2587                                      int32_t zExp, uint64_t zSig0, uint64_t zSig1,
2588                                      float_status *status)
2589 {
2590     int8_t roundingMode;
2591     flag roundNearestEven, increment, isTiny;
2592     int64_t roundIncrement, roundMask, roundBits;
2593 
2594     roundingMode = status->float_rounding_mode;
2595     roundNearestEven = ( roundingMode == float_round_nearest_even );
2596     if ( roundingPrecision == 80 ) goto precision80;
2597     if ( roundingPrecision == 64 ) {
2598         roundIncrement = LIT64( 0x0000000000000400 );
2599         roundMask = LIT64( 0x00000000000007FF );
2600     }
2601     else if ( roundingPrecision == 32 ) {
2602         roundIncrement = LIT64( 0x0000008000000000 );
2603         roundMask = LIT64( 0x000000FFFFFFFFFF );
2604     }
2605     else {
2606         goto precision80;
2607     }
2608     zSig0 |= ( zSig1 != 0 );
2609     switch (roundingMode) {
2610     case float_round_nearest_even:
2611     case float_round_ties_away:
2612         break;
2613     case float_round_to_zero:
2614         roundIncrement = 0;
2615         break;
2616     case float_round_up:
2617         roundIncrement = zSign ? 0 : roundMask;
2618         break;
2619     case float_round_down:
2620         roundIncrement = zSign ? roundMask : 0;
2621         break;
2622     default:
2623         abort();
2624     }
2625     roundBits = zSig0 & roundMask;
2626     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
2627         if (    ( 0x7FFE < zExp )
2628              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
2629            ) {
2630             goto overflow;
2631         }
2632         if ( zExp <= 0 ) {
2633             if (status->flush_to_zero) {
2634                 float_raise(float_flag_output_denormal, status);
2635                 return packFloatx80(zSign, 0, 0);
2636             }
2637             isTiny =
2638                    (status->float_detect_tininess
2639                     == float_tininess_before_rounding)
2640                 || ( zExp < 0 )
2641                 || ( zSig0 <= zSig0 + roundIncrement );
2642             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
2643             zExp = 0;
2644             roundBits = zSig0 & roundMask;
2645             if (isTiny && roundBits) {
2646                 float_raise(float_flag_underflow, status);
2647             }
2648             if (roundBits) {
2649                 status->float_exception_flags |= float_flag_inexact;
2650             }
2651             zSig0 += roundIncrement;
2652             if ( (int64_t) zSig0 < 0 ) zExp = 1;
2653             roundIncrement = roundMask + 1;
2654             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
2655                 roundMask |= roundIncrement;
2656             }
2657             zSig0 &= ~ roundMask;
2658             return packFloatx80( zSign, zExp, zSig0 );
2659         }
2660     }
2661     if (roundBits) {
2662         status->float_exception_flags |= float_flag_inexact;
2663     }
2664     zSig0 += roundIncrement;
2665     if ( zSig0 < roundIncrement ) {
2666         ++zExp;
2667         zSig0 = LIT64( 0x8000000000000000 );
2668     }
2669     roundIncrement = roundMask + 1;
2670     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
2671         roundMask |= roundIncrement;
2672     }
2673     zSig0 &= ~ roundMask;
2674     if ( zSig0 == 0 ) zExp = 0;
2675     return packFloatx80( zSign, zExp, zSig0 );
2676  precision80:
2677     switch (roundingMode) {
2678     case float_round_nearest_even:
2679     case float_round_ties_away:
2680         increment = ((int64_t)zSig1 < 0);
2681         break;
2682     case float_round_to_zero:
2683         increment = 0;
2684         break;
2685     case float_round_up:
2686         increment = !zSign && zSig1;
2687         break;
2688     case float_round_down:
2689         increment = zSign && zSig1;
2690         break;
2691     default:
2692         abort();
2693     }
2694     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
2695         if (    ( 0x7FFE < zExp )
2696              || (    ( zExp == 0x7FFE )
2697                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
2698                   && increment
2699                 )
2700            ) {
2701             roundMask = 0;
2702  overflow:
2703             float_raise(float_flag_overflow | float_flag_inexact, status);
2704             if (    ( roundingMode == float_round_to_zero )
2705                  || ( zSign && ( roundingMode == float_round_up ) )
2706                  || ( ! zSign && ( roundingMode == float_round_down ) )
2707                ) {
2708                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
2709             }
2710             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2711         }
2712         if ( zExp <= 0 ) {
2713             isTiny =
2714                    (status->float_detect_tininess
2715                     == float_tininess_before_rounding)
2716                 || ( zExp < 0 )
2717                 || ! increment
2718                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
2719             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
2720             zExp = 0;
2721             if (isTiny && zSig1) {
2722                 float_raise(float_flag_underflow, status);
2723             }
2724             if (zSig1) {
2725                 status->float_exception_flags |= float_flag_inexact;
2726             }
2727             switch (roundingMode) {
2728             case float_round_nearest_even:
2729             case float_round_ties_away:
2730                 increment = ((int64_t)zSig1 < 0);
2731                 break;
2732             case float_round_to_zero:
2733                 increment = 0;
2734                 break;
2735             case float_round_up:
2736                 increment = !zSign && zSig1;
2737                 break;
2738             case float_round_down:
2739                 increment = zSign && zSig1;
2740                 break;
2741             default:
2742                 abort();
2743             }
2744             if ( increment ) {
2745                 ++zSig0;
2746                 zSig0 &=
2747                     ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
2748                 if ( (int64_t) zSig0 < 0 ) zExp = 1;
2749             }
2750             return packFloatx80( zSign, zExp, zSig0 );
2751         }
2752     }
2753     if (zSig1) {
2754         status->float_exception_flags |= float_flag_inexact;
2755     }
2756     if ( increment ) {
2757         ++zSig0;
2758         if ( zSig0 == 0 ) {
2759             ++zExp;
2760             zSig0 = LIT64( 0x8000000000000000 );
2761         }
2762         else {
2763             zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
2764         }
2765     }
2766     else {
2767         if ( zSig0 == 0 ) zExp = 0;
2768     }
2769     return packFloatx80( zSign, zExp, zSig0 );
2770 
2771 }
2772 
2773 /*----------------------------------------------------------------------------
2774 | Takes an abstract floating-point value having sign `zSign', exponent
2775 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
2776 | and returns the proper extended double-precision floating-point value
2777 | corresponding to the abstract input.  This routine is just like
2778 | `roundAndPackFloatx80' except that the input significand does not have to be
2779 | normalized.
2780 *----------------------------------------------------------------------------*/
2781 
2782 static floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
2783                                               flag zSign, int32_t zExp,
2784                                               uint64_t zSig0, uint64_t zSig1,
2785                                               float_status *status)
2786 {
2787     int8_t shiftCount;
2788 
2789     if ( zSig0 == 0 ) {
2790         zSig0 = zSig1;
2791         zSig1 = 0;
2792         zExp -= 64;
2793     }
2794     shiftCount = countLeadingZeros64( zSig0 );
2795     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
2796     zExp -= shiftCount;
2797     return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
2798                                 zSig0, zSig1, status);
2799 
2800 }
2801 
2802 /*----------------------------------------------------------------------------
2803 | Returns the least-significant 64 fraction bits of the quadruple-precision
2804 | floating-point value `a'.
2805 *----------------------------------------------------------------------------*/
2806 
2807 static inline uint64_t extractFloat128Frac1( float128 a )
2808 {
2809 
2810     return a.low;
2811 
2812 }
2813 
2814 /*----------------------------------------------------------------------------
2815 | Returns the most-significant 48 fraction bits of the quadruple-precision
2816 | floating-point value `a'.
2817 *----------------------------------------------------------------------------*/
2818 
2819 static inline uint64_t extractFloat128Frac0( float128 a )
2820 {
2821 
2822     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
2823 
2824 }
2825 
2826 /*----------------------------------------------------------------------------
2827 | Returns the exponent bits of the quadruple-precision floating-point value
2828 | `a'.
2829 *----------------------------------------------------------------------------*/
2830 
2831 static inline int32_t extractFloat128Exp( float128 a )
2832 {
2833 
2834     return ( a.high>>48 ) & 0x7FFF;
2835 
2836 }
2837 
2838 /*----------------------------------------------------------------------------
2839 | Returns the sign bit of the quadruple-precision floating-point value `a'.
2840 *----------------------------------------------------------------------------*/
2841 
2842 static inline flag extractFloat128Sign( float128 a )
2843 {
2844 
2845     return a.high>>63;
2846 
2847 }
2848 
2849 /*----------------------------------------------------------------------------
2850 | Normalizes the subnormal quadruple-precision floating-point value
2851 | represented by the denormalized significand formed by the concatenation of
2852 | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
2853 | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
2854 | significand are stored at the location pointed to by `zSig0Ptr', and the
2855 | least significant 64 bits of the normalized significand are stored at the
2856 | location pointed to by `zSig1Ptr'.
2857 *----------------------------------------------------------------------------*/
2858 
2859 static void
2860  normalizeFloat128Subnormal(
2861      uint64_t aSig0,
2862      uint64_t aSig1,
2863      int32_t *zExpPtr,
2864      uint64_t *zSig0Ptr,
2865      uint64_t *zSig1Ptr
2866  )
2867 {
2868     int8_t shiftCount;
2869 
2870     if ( aSig0 == 0 ) {
2871         shiftCount = countLeadingZeros64( aSig1 ) - 15;
2872         if ( shiftCount < 0 ) {
2873             *zSig0Ptr = aSig1>>( - shiftCount );
2874             *zSig1Ptr = aSig1<<( shiftCount & 63 );
2875         }
2876         else {
2877             *zSig0Ptr = aSig1<<shiftCount;
2878             *zSig1Ptr = 0;
2879         }
2880         *zExpPtr = - shiftCount - 63;
2881     }
2882     else {
2883         shiftCount = countLeadingZeros64( aSig0 ) - 15;
2884         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
2885         *zExpPtr = 1 - shiftCount;
2886     }
2887 
2888 }
2889 
2890 /*----------------------------------------------------------------------------
2891 | Packs the sign `zSign', the exponent `zExp', and the significand formed
2892 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
2893 | floating-point value, returning the result.  After being shifted into the
2894 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
2895 | added together to form the most significant 32 bits of the result.  This
2896 | means that any integer portion of `zSig0' will be added into the exponent.
2897 | Since a properly normalized significand will have an integer portion equal
2898 | to 1, the `zExp' input should be 1 less than the desired result exponent
2899 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
2900 | significand.
2901 *----------------------------------------------------------------------------*/
2902 
2903 static inline float128
2904  packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
2905 {
2906     float128 z;
2907 
2908     z.low = zSig1;
2909     z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
2910     return z;
2911 
2912 }
2913 
2914 /*----------------------------------------------------------------------------
2915 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2916 | and extended significand formed by the concatenation of `zSig0', `zSig1',
2917 | and `zSig2', and returns the proper quadruple-precision floating-point value
2918 | corresponding to the abstract input.  Ordinarily, the abstract value is
2919 | simply rounded and packed into the quadruple-precision format, with the
2920 | inexact exception raised if the abstract input cannot be represented
2921 | exactly.  However, if the abstract value is too large, the overflow and
2922 | inexact exceptions are raised and an infinity or maximal finite value is
2923 | returned.  If the abstract value is too small, the input value is rounded to
2924 | a subnormal number, and the underflow and inexact exceptions are raised if
2925 | the abstract input cannot be represented exactly as a subnormal quadruple-
2926 | precision floating-point number.
2927 |     The input significand must be normalized or smaller.  If the input
2928 | significand is not normalized, `zExp' must be 0; in that case, the result
2929 | returned is a subnormal number, and it must not require rounding.  In the
2930 | usual case that the input significand is normalized, `zExp' must be 1 less
2931 | than the ``true'' floating-point exponent.  The handling of underflow and
2932 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2933 *----------------------------------------------------------------------------*/
2934 
2935 static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
2936                                      uint64_t zSig0, uint64_t zSig1,
2937                                      uint64_t zSig2, float_status *status)
2938 {
2939     int8_t roundingMode;
2940     flag roundNearestEven, increment, isTiny;
2941 
2942     roundingMode = status->float_rounding_mode;
2943     roundNearestEven = ( roundingMode == float_round_nearest_even );
2944     switch (roundingMode) {
2945     case float_round_nearest_even:
2946     case float_round_ties_away:
2947         increment = ((int64_t)zSig2 < 0);
2948         break;
2949     case float_round_to_zero:
2950         increment = 0;
2951         break;
2952     case float_round_up:
2953         increment = !zSign && zSig2;
2954         break;
2955     case float_round_down:
2956         increment = zSign && zSig2;
2957         break;
2958     case float_round_to_odd:
2959         increment = !(zSig1 & 0x1) && zSig2;
2960         break;
2961     default:
2962         abort();
2963     }
2964     if ( 0x7FFD <= (uint32_t) zExp ) {
2965         if (    ( 0x7FFD < zExp )
2966              || (    ( zExp == 0x7FFD )
2967                   && eq128(
2968                          LIT64( 0x0001FFFFFFFFFFFF ),
2969                          LIT64( 0xFFFFFFFFFFFFFFFF ),
2970                          zSig0,
2971                          zSig1
2972                      )
2973                   && increment
2974                 )
2975            ) {
2976             float_raise(float_flag_overflow | float_flag_inexact, status);
2977             if (    ( roundingMode == float_round_to_zero )
2978                  || ( zSign && ( roundingMode == float_round_up ) )
2979                  || ( ! zSign && ( roundingMode == float_round_down ) )
2980                  || (roundingMode == float_round_to_odd)
2981                ) {
2982                 return
2983                     packFloat128(
2984                         zSign,
2985                         0x7FFE,
2986                         LIT64( 0x0000FFFFFFFFFFFF ),
2987                         LIT64( 0xFFFFFFFFFFFFFFFF )
2988                     );
2989             }
2990             return packFloat128( zSign, 0x7FFF, 0, 0 );
2991         }
2992         if ( zExp < 0 ) {
2993             if (status->flush_to_zero) {
2994                 float_raise(float_flag_output_denormal, status);
2995                 return packFloat128(zSign, 0, 0, 0);
2996             }
2997             isTiny =
2998                    (status->float_detect_tininess
2999                     == float_tininess_before_rounding)
3000                 || ( zExp < -1 )
3001                 || ! increment
3002                 || lt128(
3003                        zSig0,
3004                        zSig1,
3005                        LIT64( 0x0001FFFFFFFFFFFF ),
3006                        LIT64( 0xFFFFFFFFFFFFFFFF )
3007                    );
3008             shift128ExtraRightJamming(
3009                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
3010             zExp = 0;
3011             if (isTiny && zSig2) {
3012                 float_raise(float_flag_underflow, status);
3013             }
3014             switch (roundingMode) {
3015             case float_round_nearest_even:
3016             case float_round_ties_away:
3017                 increment = ((int64_t)zSig2 < 0);
3018                 break;
3019             case float_round_to_zero:
3020                 increment = 0;
3021                 break;
3022             case float_round_up:
3023                 increment = !zSign && zSig2;
3024                 break;
3025             case float_round_down:
3026                 increment = zSign && zSig2;
3027                 break;
3028             case float_round_to_odd:
3029                 increment = !(zSig1 & 0x1) && zSig2;
3030                 break;
3031             default:
3032                 abort();
3033             }
3034         }
3035     }
3036     if (zSig2) {
3037         status->float_exception_flags |= float_flag_inexact;
3038     }
3039     if ( increment ) {
3040         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
3041         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
3042     }
3043     else {
3044         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
3045     }
3046     return packFloat128( zSign, zExp, zSig0, zSig1 );
3047 
3048 }
3049 
3050 /*----------------------------------------------------------------------------
3051 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3052 | and significand formed by the concatenation of `zSig0' and `zSig1', and
3053 | returns the proper quadruple-precision floating-point value corresponding
3054 | to the abstract input.  This routine is just like `roundAndPackFloat128'
3055 | except that the input significand has fewer bits and does not have to be
3056 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
3057 | point exponent.
3058 *----------------------------------------------------------------------------*/
3059 
3060 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
3061                                               uint64_t zSig0, uint64_t zSig1,
3062                                               float_status *status)
3063 {
3064     int8_t shiftCount;
3065     uint64_t zSig2;
3066 
3067     if ( zSig0 == 0 ) {
3068         zSig0 = zSig1;
3069         zSig1 = 0;
3070         zExp -= 64;
3071     }
3072     shiftCount = countLeadingZeros64( zSig0 ) - 15;
3073     if ( 0 <= shiftCount ) {
3074         zSig2 = 0;
3075         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3076     }
3077     else {
3078         shift128ExtraRightJamming(
3079             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
3080     }
3081     zExp -= shiftCount;
3082     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
3083 
3084 }
3085 
3086 
3087 /*----------------------------------------------------------------------------
3088 | Returns the result of converting the 32-bit two's complement integer `a'
3089 | to the extended double-precision floating-point format.  The conversion
3090 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3091 | Arithmetic.
3092 *----------------------------------------------------------------------------*/
3093 
3094 floatx80 int32_to_floatx80(int32_t a, float_status *status)
3095 {
3096     flag zSign;
3097     uint32_t absA;
3098     int8_t shiftCount;
3099     uint64_t zSig;
3100 
3101     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
3102     zSign = ( a < 0 );
3103     absA = zSign ? - a : a;
3104     shiftCount = countLeadingZeros32( absA ) + 32;
3105     zSig = absA;
3106     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
3107 
3108 }
3109 
3110 /*----------------------------------------------------------------------------
3111 | Returns the result of converting the 32-bit two's complement integer `a' to
3112 | the quadruple-precision floating-point format.  The conversion is performed
3113 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3114 *----------------------------------------------------------------------------*/
3115 
3116 float128 int32_to_float128(int32_t a, float_status *status)
3117 {
3118     flag zSign;
3119     uint32_t absA;
3120     int8_t shiftCount;
3121     uint64_t zSig0;
3122 
3123     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
3124     zSign = ( a < 0 );
3125     absA = zSign ? - a : a;
3126     shiftCount = countLeadingZeros32( absA ) + 17;
3127     zSig0 = absA;
3128     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
3129 
3130 }
3131 
3132 /*----------------------------------------------------------------------------
3133 | Returns the result of converting the 64-bit two's complement integer `a'
3134 | to the extended double-precision floating-point format.  The conversion
3135 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3136 | Arithmetic.
3137 *----------------------------------------------------------------------------*/
3138 
3139 floatx80 int64_to_floatx80(int64_t a, float_status *status)
3140 {
3141     flag zSign;
3142     uint64_t absA;
3143     int8_t shiftCount;
3144 
3145     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
3146     zSign = ( a < 0 );
3147     absA = zSign ? - a : a;
3148     shiftCount = countLeadingZeros64( absA );
3149     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
3150 
3151 }
3152 
3153 /*----------------------------------------------------------------------------
3154 | Returns the result of converting the 64-bit two's complement integer `a' to
3155 | the quadruple-precision floating-point format.  The conversion is performed
3156 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3157 *----------------------------------------------------------------------------*/
3158 
3159 float128 int64_to_float128(int64_t a, float_status *status)
3160 {
3161     flag zSign;
3162     uint64_t absA;
3163     int8_t shiftCount;
3164     int32_t zExp;
3165     uint64_t zSig0, zSig1;
3166 
3167     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
3168     zSign = ( a < 0 );
3169     absA = zSign ? - a : a;
3170     shiftCount = countLeadingZeros64( absA ) + 49;
3171     zExp = 0x406E - shiftCount;
3172     if ( 64 <= shiftCount ) {
3173         zSig1 = 0;
3174         zSig0 = absA;
3175         shiftCount -= 64;
3176     }
3177     else {
3178         zSig1 = absA;
3179         zSig0 = 0;
3180     }
3181     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3182     return packFloat128( zSign, zExp, zSig0, zSig1 );
3183 
3184 }
3185 
3186 /*----------------------------------------------------------------------------
3187 | Returns the result of converting the 64-bit unsigned integer `a'
3188 | to the quadruple-precision floating-point format.  The conversion is performed
3189 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3190 *----------------------------------------------------------------------------*/
3191 
3192 float128 uint64_to_float128(uint64_t a, float_status *status)
3193 {
3194     if (a == 0) {
3195         return float128_zero;
3196     }
3197     return normalizeRoundAndPackFloat128(0, 0x406E, a, 0, status);
3198 }
3199 
3200 
3201 
3202 
3203 /*----------------------------------------------------------------------------
3204 | Returns the result of converting the single-precision floating-point value
3205 | `a' to the double-precision floating-point format.  The conversion is
3206 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3207 | Arithmetic.
3208 *----------------------------------------------------------------------------*/
3209 
3210 float64 float32_to_float64(float32 a, float_status *status)
3211 {
3212     flag aSign;
3213     int aExp;
3214     uint32_t aSig;
3215     a = float32_squash_input_denormal(a, status);
3216 
3217     aSig = extractFloat32Frac( a );
3218     aExp = extractFloat32Exp( a );
3219     aSign = extractFloat32Sign( a );
3220     if ( aExp == 0xFF ) {
3221         if (aSig) {
3222             return commonNaNToFloat64(float32ToCommonNaN(a, status), status);
3223         }
3224         return packFloat64( aSign, 0x7FF, 0 );
3225     }
3226     if ( aExp == 0 ) {
3227         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
3228         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3229         --aExp;
3230     }
3231     return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
3232 
3233 }
3234 
3235 /*----------------------------------------------------------------------------
3236 | Returns the result of converting the single-precision floating-point value
3237 | `a' to the extended double-precision floating-point format.  The conversion
3238 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3239 | Arithmetic.
3240 *----------------------------------------------------------------------------*/
3241 
3242 floatx80 float32_to_floatx80(float32 a, float_status *status)
3243 {
3244     flag aSign;
3245     int aExp;
3246     uint32_t aSig;
3247 
3248     a = float32_squash_input_denormal(a, status);
3249     aSig = extractFloat32Frac( a );
3250     aExp = extractFloat32Exp( a );
3251     aSign = extractFloat32Sign( a );
3252     if ( aExp == 0xFF ) {
3253         if (aSig) {
3254             return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
3255         }
3256         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3257     }
3258     if ( aExp == 0 ) {
3259         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3260         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3261     }
3262     aSig |= 0x00800000;
3263     return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
3264 
3265 }
3266 
3267 /*----------------------------------------------------------------------------
3268 | Returns the result of converting the single-precision floating-point value
3269 | `a' to the double-precision floating-point format.  The conversion is
3270 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3271 | Arithmetic.
3272 *----------------------------------------------------------------------------*/
3273 
3274 float128 float32_to_float128(float32 a, float_status *status)
3275 {
3276     flag aSign;
3277     int aExp;
3278     uint32_t aSig;
3279 
3280     a = float32_squash_input_denormal(a, status);
3281     aSig = extractFloat32Frac( a );
3282     aExp = extractFloat32Exp( a );
3283     aSign = extractFloat32Sign( a );
3284     if ( aExp == 0xFF ) {
3285         if (aSig) {
3286             return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
3287         }
3288         return packFloat128( aSign, 0x7FFF, 0, 0 );
3289     }
3290     if ( aExp == 0 ) {
3291         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3292         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3293         --aExp;
3294     }
3295     return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
3296 
3297 }
3298 
3299 /*----------------------------------------------------------------------------
3300 | Returns the remainder of the single-precision floating-point value `a'
3301 | with respect to the corresponding value `b'.  The operation is performed
3302 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3303 *----------------------------------------------------------------------------*/
3304 
3305 float32 float32_rem(float32 a, float32 b, float_status *status)
3306 {
3307     flag aSign, zSign;
3308     int aExp, bExp, expDiff;
3309     uint32_t aSig, bSig;
3310     uint32_t q;
3311     uint64_t aSig64, bSig64, q64;
3312     uint32_t alternateASig;
3313     int32_t sigMean;
3314     a = float32_squash_input_denormal(a, status);
3315     b = float32_squash_input_denormal(b, status);
3316 
3317     aSig = extractFloat32Frac( a );
3318     aExp = extractFloat32Exp( a );
3319     aSign = extractFloat32Sign( a );
3320     bSig = extractFloat32Frac( b );
3321     bExp = extractFloat32Exp( b );
3322     if ( aExp == 0xFF ) {
3323         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
3324             return propagateFloat32NaN(a, b, status);
3325         }
3326         float_raise(float_flag_invalid, status);
3327         return float32_default_nan(status);
3328     }
3329     if ( bExp == 0xFF ) {
3330         if (bSig) {
3331             return propagateFloat32NaN(a, b, status);
3332         }
3333         return a;
3334     }
3335     if ( bExp == 0 ) {
3336         if ( bSig == 0 ) {
3337             float_raise(float_flag_invalid, status);
3338             return float32_default_nan(status);
3339         }
3340         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
3341     }
3342     if ( aExp == 0 ) {
3343         if ( aSig == 0 ) return a;
3344         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3345     }
3346     expDiff = aExp - bExp;
3347     aSig |= 0x00800000;
3348     bSig |= 0x00800000;
3349     if ( expDiff < 32 ) {
3350         aSig <<= 8;
3351         bSig <<= 8;
3352         if ( expDiff < 0 ) {
3353             if ( expDiff < -1 ) return a;
3354             aSig >>= 1;
3355         }
3356         q = ( bSig <= aSig );
3357         if ( q ) aSig -= bSig;
3358         if ( 0 < expDiff ) {
3359             q = ( ( (uint64_t) aSig )<<32 ) / bSig;
3360             q >>= 32 - expDiff;
3361             bSig >>= 2;
3362             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3363         }
3364         else {
3365             aSig >>= 2;
3366             bSig >>= 2;
3367         }
3368     }
3369     else {
3370         if ( bSig <= aSig ) aSig -= bSig;
3371         aSig64 = ( (uint64_t) aSig )<<40;
3372         bSig64 = ( (uint64_t) bSig )<<40;
3373         expDiff -= 64;
3374         while ( 0 < expDiff ) {
3375             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3376             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3377             aSig64 = - ( ( bSig * q64 )<<38 );
3378             expDiff -= 62;
3379         }
3380         expDiff += 64;
3381         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3382         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3383         q = q64>>( 64 - expDiff );
3384         bSig <<= 6;
3385         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
3386     }
3387     do {
3388         alternateASig = aSig;
3389         ++q;
3390         aSig -= bSig;
3391     } while ( 0 <= (int32_t) aSig );
3392     sigMean = aSig + alternateASig;
3393     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3394         aSig = alternateASig;
3395     }
3396     zSign = ( (int32_t) aSig < 0 );
3397     if ( zSign ) aSig = - aSig;
3398     return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
3399 }
3400 
3401 
3402 
3403 /*----------------------------------------------------------------------------
3404 | Returns the binary exponential of the single-precision floating-point value
3405 | `a'. The operation is performed according to the IEC/IEEE Standard for
3406 | Binary Floating-Point Arithmetic.
3407 |
3408 | Uses the following identities:
3409 |
3410 | 1. -------------------------------------------------------------------------
3411 |      x    x*ln(2)
3412 |     2  = e
3413 |
3414 | 2. -------------------------------------------------------------------------
3415 |                      2     3     4     5           n
3416 |      x        x     x     x     x     x           x
3417 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
3418 |               1!    2!    3!    4!    5!          n!
3419 *----------------------------------------------------------------------------*/
3420 
3421 static const float64 float32_exp2_coefficients[15] =
3422 {
3423     const_float64( 0x3ff0000000000000ll ), /*  1 */
3424     const_float64( 0x3fe0000000000000ll ), /*  2 */
3425     const_float64( 0x3fc5555555555555ll ), /*  3 */
3426     const_float64( 0x3fa5555555555555ll ), /*  4 */
3427     const_float64( 0x3f81111111111111ll ), /*  5 */
3428     const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
3429     const_float64( 0x3f2a01a01a01a01all ), /*  7 */
3430     const_float64( 0x3efa01a01a01a01all ), /*  8 */
3431     const_float64( 0x3ec71de3a556c734ll ), /*  9 */
3432     const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
3433     const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
3434     const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
3435     const_float64( 0x3de6124613a86d09ll ), /* 13 */
3436     const_float64( 0x3da93974a8c07c9dll ), /* 14 */
3437     const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
3438 };
3439 
3440 float32 float32_exp2(float32 a, float_status *status)
3441 {
3442     flag aSign;
3443     int aExp;
3444     uint32_t aSig;
3445     float64 r, x, xn;
3446     int i;
3447     a = float32_squash_input_denormal(a, status);
3448 
3449     aSig = extractFloat32Frac( a );
3450     aExp = extractFloat32Exp( a );
3451     aSign = extractFloat32Sign( a );
3452 
3453     if ( aExp == 0xFF) {
3454         if (aSig) {
3455             return propagateFloat32NaN(a, float32_zero, status);
3456         }
3457         return (aSign) ? float32_zero : a;
3458     }
3459     if (aExp == 0) {
3460         if (aSig == 0) return float32_one;
3461     }
3462 
3463     float_raise(float_flag_inexact, status);
3464 
3465     /* ******************************* */
3466     /* using float64 for approximation */
3467     /* ******************************* */
3468     x = float32_to_float64(a, status);
3469     x = float64_mul(x, float64_ln2, status);
3470 
3471     xn = x;
3472     r = float64_one;
3473     for (i = 0 ; i < 15 ; i++) {
3474         float64 f;
3475 
3476         f = float64_mul(xn, float32_exp2_coefficients[i], status);
3477         r = float64_add(r, f, status);
3478 
3479         xn = float64_mul(xn, x, status);
3480     }
3481 
3482     return float64_to_float32(r, status);
3483 }
3484 
3485 /*----------------------------------------------------------------------------
3486 | Returns the binary log of the single-precision floating-point value `a'.
3487 | The operation is performed according to the IEC/IEEE Standard for Binary
3488 | Floating-Point Arithmetic.
3489 *----------------------------------------------------------------------------*/
3490 float32 float32_log2(float32 a, float_status *status)
3491 {
3492     flag aSign, zSign;
3493     int aExp;
3494     uint32_t aSig, zSig, i;
3495 
3496     a = float32_squash_input_denormal(a, status);
3497     aSig = extractFloat32Frac( a );
3498     aExp = extractFloat32Exp( a );
3499     aSign = extractFloat32Sign( a );
3500 
3501     if ( aExp == 0 ) {
3502         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
3503         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3504     }
3505     if ( aSign ) {
3506         float_raise(float_flag_invalid, status);
3507         return float32_default_nan(status);
3508     }
3509     if ( aExp == 0xFF ) {
3510         if (aSig) {
3511             return propagateFloat32NaN(a, float32_zero, status);
3512         }
3513         return a;
3514     }
3515 
3516     aExp -= 0x7F;
3517     aSig |= 0x00800000;
3518     zSign = aExp < 0;
3519     zSig = aExp << 23;
3520 
3521     for (i = 1 << 22; i > 0; i >>= 1) {
3522         aSig = ( (uint64_t)aSig * aSig ) >> 23;
3523         if ( aSig & 0x01000000 ) {
3524             aSig >>= 1;
3525             zSig |= i;
3526         }
3527     }
3528 
3529     if ( zSign )
3530         zSig = -zSig;
3531 
3532     return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
3533 }
3534 
3535 /*----------------------------------------------------------------------------
3536 | Returns 1 if the single-precision floating-point value `a' is equal to
3537 | the corresponding value `b', and 0 otherwise.  The invalid exception is
3538 | raised if either operand is a NaN.  Otherwise, the comparison is performed
3539 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3540 *----------------------------------------------------------------------------*/
3541 
3542 int float32_eq(float32 a, float32 b, float_status *status)
3543 {
3544     uint32_t av, bv;
3545     a = float32_squash_input_denormal(a, status);
3546     b = float32_squash_input_denormal(b, status);
3547 
3548     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3549          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3550        ) {
3551         float_raise(float_flag_invalid, status);
3552         return 0;
3553     }
3554     av = float32_val(a);
3555     bv = float32_val(b);
3556     return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3557 }
3558 
3559 /*----------------------------------------------------------------------------
3560 | Returns 1 if the single-precision floating-point value `a' is less than
3561 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
3562 | exception is raised if either operand is a NaN.  The comparison is performed
3563 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3564 *----------------------------------------------------------------------------*/
3565 
3566 int float32_le(float32 a, float32 b, float_status *status)
3567 {
3568     flag aSign, bSign;
3569     uint32_t av, bv;
3570     a = float32_squash_input_denormal(a, status);
3571     b = float32_squash_input_denormal(b, status);
3572 
3573     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3574          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3575        ) {
3576         float_raise(float_flag_invalid, status);
3577         return 0;
3578     }
3579     aSign = extractFloat32Sign( a );
3580     bSign = extractFloat32Sign( b );
3581     av = float32_val(a);
3582     bv = float32_val(b);
3583     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3584     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3585 
3586 }
3587 
3588 /*----------------------------------------------------------------------------
3589 | Returns 1 if the single-precision floating-point value `a' is less than
3590 | the corresponding value `b', and 0 otherwise.  The invalid exception is
3591 | raised if either operand is a NaN.  The comparison is performed according
3592 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3593 *----------------------------------------------------------------------------*/
3594 
3595 int float32_lt(float32 a, float32 b, float_status *status)
3596 {
3597     flag aSign, bSign;
3598     uint32_t av, bv;
3599     a = float32_squash_input_denormal(a, status);
3600     b = float32_squash_input_denormal(b, status);
3601 
3602     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3603          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3604        ) {
3605         float_raise(float_flag_invalid, status);
3606         return 0;
3607     }
3608     aSign = extractFloat32Sign( a );
3609     bSign = extractFloat32Sign( b );
3610     av = float32_val(a);
3611     bv = float32_val(b);
3612     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3613     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3614 
3615 }
3616 
3617 /*----------------------------------------------------------------------------
3618 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3619 | be compared, and 0 otherwise.  The invalid exception is raised if either
3620 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
3621 | Standard for Binary Floating-Point Arithmetic.
3622 *----------------------------------------------------------------------------*/
3623 
3624 int float32_unordered(float32 a, float32 b, float_status *status)
3625 {
3626     a = float32_squash_input_denormal(a, status);
3627     b = float32_squash_input_denormal(b, status);
3628 
3629     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3630          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3631        ) {
3632         float_raise(float_flag_invalid, status);
3633         return 1;
3634     }
3635     return 0;
3636 }
3637 
3638 /*----------------------------------------------------------------------------
3639 | Returns 1 if the single-precision floating-point value `a' is equal to
3640 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3641 | exception.  The comparison is performed according to the IEC/IEEE Standard
3642 | for Binary Floating-Point Arithmetic.
3643 *----------------------------------------------------------------------------*/
3644 
3645 int float32_eq_quiet(float32 a, float32 b, float_status *status)
3646 {
3647     a = float32_squash_input_denormal(a, status);
3648     b = float32_squash_input_denormal(b, status);
3649 
3650     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3651          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3652        ) {
3653         if (float32_is_signaling_nan(a, status)
3654          || float32_is_signaling_nan(b, status)) {
3655             float_raise(float_flag_invalid, status);
3656         }
3657         return 0;
3658     }
3659     return ( float32_val(a) == float32_val(b) ) ||
3660             ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
3661 }
3662 
3663 /*----------------------------------------------------------------------------
3664 | Returns 1 if the single-precision floating-point value `a' is less than or
3665 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3666 | cause an exception.  Otherwise, the comparison is performed according to the
3667 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3668 *----------------------------------------------------------------------------*/
3669 
3670 int float32_le_quiet(float32 a, float32 b, float_status *status)
3671 {
3672     flag aSign, bSign;
3673     uint32_t av, bv;
3674     a = float32_squash_input_denormal(a, status);
3675     b = float32_squash_input_denormal(b, status);
3676 
3677     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3678          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3679        ) {
3680         if (float32_is_signaling_nan(a, status)
3681          || float32_is_signaling_nan(b, status)) {
3682             float_raise(float_flag_invalid, status);
3683         }
3684         return 0;
3685     }
3686     aSign = extractFloat32Sign( a );
3687     bSign = extractFloat32Sign( b );
3688     av = float32_val(a);
3689     bv = float32_val(b);
3690     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3691     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3692 
3693 }
3694 
3695 /*----------------------------------------------------------------------------
3696 | Returns 1 if the single-precision floating-point value `a' is less than
3697 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3698 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3699 | Standard for Binary Floating-Point Arithmetic.
3700 *----------------------------------------------------------------------------*/
3701 
3702 int float32_lt_quiet(float32 a, float32 b, float_status *status)
3703 {
3704     flag aSign, bSign;
3705     uint32_t av, bv;
3706     a = float32_squash_input_denormal(a, status);
3707     b = float32_squash_input_denormal(b, status);
3708 
3709     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3710          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3711        ) {
3712         if (float32_is_signaling_nan(a, status)
3713          || float32_is_signaling_nan(b, status)) {
3714             float_raise(float_flag_invalid, status);
3715         }
3716         return 0;
3717     }
3718     aSign = extractFloat32Sign( a );
3719     bSign = extractFloat32Sign( b );
3720     av = float32_val(a);
3721     bv = float32_val(b);
3722     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3723     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3724 
3725 }
3726 
3727 /*----------------------------------------------------------------------------
3728 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3729 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
3730 | comparison is performed according to the IEC/IEEE Standard for Binary
3731 | Floating-Point Arithmetic.
3732 *----------------------------------------------------------------------------*/
3733 
3734 int float32_unordered_quiet(float32 a, float32 b, float_status *status)
3735 {
3736     a = float32_squash_input_denormal(a, status);
3737     b = float32_squash_input_denormal(b, status);
3738 
3739     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3740          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3741        ) {
3742         if (float32_is_signaling_nan(a, status)
3743          || float32_is_signaling_nan(b, status)) {
3744             float_raise(float_flag_invalid, status);
3745         }
3746         return 1;
3747     }
3748     return 0;
3749 }
3750 
3751 
3752 /*----------------------------------------------------------------------------
3753 | Returns the result of converting the double-precision floating-point value
3754 | `a' to the single-precision floating-point format.  The conversion is
3755 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3756 | Arithmetic.
3757 *----------------------------------------------------------------------------*/
3758 
3759 float32 float64_to_float32(float64 a, float_status *status)
3760 {
3761     flag aSign;
3762     int aExp;
3763     uint64_t aSig;
3764     uint32_t zSig;
3765     a = float64_squash_input_denormal(a, status);
3766 
3767     aSig = extractFloat64Frac( a );
3768     aExp = extractFloat64Exp( a );
3769     aSign = extractFloat64Sign( a );
3770     if ( aExp == 0x7FF ) {
3771         if (aSig) {
3772             return commonNaNToFloat32(float64ToCommonNaN(a, status), status);
3773         }
3774         return packFloat32( aSign, 0xFF, 0 );
3775     }
3776     shift64RightJamming( aSig, 22, &aSig );
3777     zSig = aSig;
3778     if ( aExp || zSig ) {
3779         zSig |= 0x40000000;
3780         aExp -= 0x381;
3781     }
3782     return roundAndPackFloat32(aSign, aExp, zSig, status);
3783 
3784 }
3785 
3786 
3787 /*----------------------------------------------------------------------------
3788 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3789 | half-precision floating-point value, returning the result.  After being
3790 | shifted into the proper positions, the three fields are simply added
3791 | together to form the result.  This means that any integer portion of `zSig'
3792 | will be added into the exponent.  Since a properly normalized significand
3793 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3794 | than the desired result exponent whenever `zSig' is a complete, normalized
3795 | significand.
3796 *----------------------------------------------------------------------------*/
3797 static float16 packFloat16(flag zSign, int zExp, uint16_t zSig)
3798 {
3799     return make_float16(
3800         (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
3801 }
3802 
3803 /*----------------------------------------------------------------------------
3804 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3805 | and significand `zSig', and returns the proper half-precision floating-
3806 | point value corresponding to the abstract input.  Ordinarily, the abstract
3807 | value is simply rounded and packed into the half-precision format, with
3808 | the inexact exception raised if the abstract input cannot be represented
3809 | exactly.  However, if the abstract value is too large, the overflow and
3810 | inexact exceptions are raised and an infinity or maximal finite value is
3811 | returned.  If the abstract value is too small, the input value is rounded to
3812 | a subnormal number, and the underflow and inexact exceptions are raised if
3813 | the abstract input cannot be represented exactly as a subnormal half-
3814 | precision floating-point number.
3815 | The `ieee' flag indicates whether to use IEEE standard half precision, or
3816 | ARM-style "alternative representation", which omits the NaN and Inf
3817 | encodings in order to raise the maximum representable exponent by one.
3818 |     The input significand `zSig' has its binary point between bits 22
3819 | and 23, which is 13 bits to the left of the usual location.  This shifted
3820 | significand must be normalized or smaller.  If `zSig' is not normalized,
3821 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3822 | and it must not require rounding.  In the usual case that `zSig' is
3823 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3824 | Note the slightly odd position of the binary point in zSig compared with the
3825 | other roundAndPackFloat functions. This should probably be fixed if we
3826 | need to implement more float16 routines than just conversion.
3827 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3828 | Binary Floating-Point Arithmetic.
3829 *----------------------------------------------------------------------------*/
3830 
3831 static float16 roundAndPackFloat16(flag zSign, int zExp,
3832                                    uint32_t zSig, flag ieee,
3833                                    float_status *status)
3834 {
3835     int maxexp = ieee ? 29 : 30;
3836     uint32_t mask;
3837     uint32_t increment;
3838     bool rounding_bumps_exp;
3839     bool is_tiny = false;
3840 
3841     /* Calculate the mask of bits of the mantissa which are not
3842      * representable in half-precision and will be lost.
3843      */
3844     if (zExp < 1) {
3845         /* Will be denormal in halfprec */
3846         mask = 0x00ffffff;
3847         if (zExp >= -11) {
3848             mask >>= 11 + zExp;
3849         }
3850     } else {
3851         /* Normal number in halfprec */
3852         mask = 0x00001fff;
3853     }
3854 
3855     switch (status->float_rounding_mode) {
3856     case float_round_nearest_even:
3857         increment = (mask + 1) >> 1;
3858         if ((zSig & mask) == increment) {
3859             increment = zSig & (increment << 1);
3860         }
3861         break;
3862     case float_round_ties_away:
3863         increment = (mask + 1) >> 1;
3864         break;
3865     case float_round_up:
3866         increment = zSign ? 0 : mask;
3867         break;
3868     case float_round_down:
3869         increment = zSign ? mask : 0;
3870         break;
3871     default: /* round_to_zero */
3872         increment = 0;
3873         break;
3874     }
3875 
3876     rounding_bumps_exp = (zSig + increment >= 0x01000000);
3877 
3878     if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) {
3879         if (ieee) {
3880             float_raise(float_flag_overflow | float_flag_inexact, status);
3881             return packFloat16(zSign, 0x1f, 0);
3882         } else {
3883             float_raise(float_flag_invalid, status);
3884             return packFloat16(zSign, 0x1f, 0x3ff);
3885         }
3886     }
3887 
3888     if (zExp < 0) {
3889         /* Note that flush-to-zero does not affect half-precision results */
3890         is_tiny =
3891             (status->float_detect_tininess == float_tininess_before_rounding)
3892             || (zExp < -1)
3893             || (!rounding_bumps_exp);
3894     }
3895     if (zSig & mask) {
3896         float_raise(float_flag_inexact, status);
3897         if (is_tiny) {
3898             float_raise(float_flag_underflow, status);
3899         }
3900     }
3901 
3902     zSig += increment;
3903     if (rounding_bumps_exp) {
3904         zSig >>= 1;
3905         zExp++;
3906     }
3907 
3908     if (zExp < -10) {
3909         return packFloat16(zSign, 0, 0);
3910     }
3911     if (zExp < 0) {
3912         zSig >>= -zExp;
3913         zExp = 0;
3914     }
3915     return packFloat16(zSign, zExp, zSig >> 13);
3916 }
3917 
3918 /*----------------------------------------------------------------------------
3919 | If `a' is denormal and we are in flush-to-zero mode then set the
3920 | input-denormal exception and return zero. Otherwise just return the value.
3921 *----------------------------------------------------------------------------*/
3922 float16 float16_squash_input_denormal(float16 a, float_status *status)
3923 {
3924     if (status->flush_inputs_to_zero) {
3925         if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) {
3926             float_raise(float_flag_input_denormal, status);
3927             return make_float16(float16_val(a) & 0x8000);
3928         }
3929     }
3930     return a;
3931 }
3932 
3933 static void normalizeFloat16Subnormal(uint32_t aSig, int *zExpPtr,
3934                                       uint32_t *zSigPtr)
3935 {
3936     int8_t shiftCount = countLeadingZeros32(aSig) - 21;
3937     *zSigPtr = aSig << shiftCount;
3938     *zExpPtr = 1 - shiftCount;
3939 }
3940 
3941 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
3942    The latter gains extra exponent range by omitting the NaN/Inf encodings.  */
3943 
3944 float32 float16_to_float32(float16 a, flag ieee, float_status *status)
3945 {
3946     flag aSign;
3947     int aExp;
3948     uint32_t aSig;
3949 
3950     aSign = extractFloat16Sign(a);
3951     aExp = extractFloat16Exp(a);
3952     aSig = extractFloat16Frac(a);
3953 
3954     if (aExp == 0x1f && ieee) {
3955         if (aSig) {
3956             return commonNaNToFloat32(float16ToCommonNaN(a, status), status);
3957         }
3958         return packFloat32(aSign, 0xff, 0);
3959     }
3960     if (aExp == 0) {
3961         if (aSig == 0) {
3962             return packFloat32(aSign, 0, 0);
3963         }
3964 
3965         normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3966         aExp--;
3967     }
3968     return packFloat32( aSign, aExp + 0x70, aSig << 13);
3969 }
3970 
3971 float16 float32_to_float16(float32 a, flag ieee, float_status *status)
3972 {
3973     flag aSign;
3974     int aExp;
3975     uint32_t aSig;
3976 
3977     a = float32_squash_input_denormal(a, status);
3978 
3979     aSig = extractFloat32Frac( a );
3980     aExp = extractFloat32Exp( a );
3981     aSign = extractFloat32Sign( a );
3982     if ( aExp == 0xFF ) {
3983         if (aSig) {
3984             /* Input is a NaN */
3985             if (!ieee) {
3986                 float_raise(float_flag_invalid, status);
3987                 return packFloat16(aSign, 0, 0);
3988             }
3989             return commonNaNToFloat16(
3990                 float32ToCommonNaN(a, status), status);
3991         }
3992         /* Infinity */
3993         if (!ieee) {
3994             float_raise(float_flag_invalid, status);
3995             return packFloat16(aSign, 0x1f, 0x3ff);
3996         }
3997         return packFloat16(aSign, 0x1f, 0);
3998     }
3999     if (aExp == 0 && aSig == 0) {
4000         return packFloat16(aSign, 0, 0);
4001     }
4002     /* Decimal point between bits 22 and 23. Note that we add the 1 bit
4003      * even if the input is denormal; however this is harmless because
4004      * the largest possible single-precision denormal is still smaller
4005      * than the smallest representable half-precision denormal, and so we
4006      * will end up ignoring aSig and returning via the "always return zero"
4007      * codepath.
4008      */
4009     aSig |= 0x00800000;
4010     aExp -= 0x71;
4011 
4012     return roundAndPackFloat16(aSign, aExp, aSig, ieee, status);
4013 }
4014 
4015 float64 float16_to_float64(float16 a, flag ieee, float_status *status)
4016 {
4017     flag aSign;
4018     int aExp;
4019     uint32_t aSig;
4020 
4021     aSign = extractFloat16Sign(a);
4022     aExp = extractFloat16Exp(a);
4023     aSig = extractFloat16Frac(a);
4024 
4025     if (aExp == 0x1f && ieee) {
4026         if (aSig) {
4027             return commonNaNToFloat64(
4028                 float16ToCommonNaN(a, status), status);
4029         }
4030         return packFloat64(aSign, 0x7ff, 0);
4031     }
4032     if (aExp == 0) {
4033         if (aSig == 0) {
4034             return packFloat64(aSign, 0, 0);
4035         }
4036 
4037         normalizeFloat16Subnormal(aSig, &aExp, &aSig);
4038         aExp--;
4039     }
4040     return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42);
4041 }
4042 
4043 float16 float64_to_float16(float64 a, flag ieee, float_status *status)
4044 {
4045     flag aSign;
4046     int aExp;
4047     uint64_t aSig;
4048     uint32_t zSig;
4049 
4050     a = float64_squash_input_denormal(a, status);
4051 
4052     aSig = extractFloat64Frac(a);
4053     aExp = extractFloat64Exp(a);
4054     aSign = extractFloat64Sign(a);
4055     if (aExp == 0x7FF) {
4056         if (aSig) {
4057             /* Input is a NaN */
4058             if (!ieee) {
4059                 float_raise(float_flag_invalid, status);
4060                 return packFloat16(aSign, 0, 0);
4061             }
4062             return commonNaNToFloat16(
4063                 float64ToCommonNaN(a, status), status);
4064         }
4065         /* Infinity */
4066         if (!ieee) {
4067             float_raise(float_flag_invalid, status);
4068             return packFloat16(aSign, 0x1f, 0x3ff);
4069         }
4070         return packFloat16(aSign, 0x1f, 0);
4071     }
4072     shift64RightJamming(aSig, 29, &aSig);
4073     zSig = aSig;
4074     if (aExp == 0 && zSig == 0) {
4075         return packFloat16(aSign, 0, 0);
4076     }
4077     /* Decimal point between bits 22 and 23. Note that we add the 1 bit
4078      * even if the input is denormal; however this is harmless because
4079      * the largest possible single-precision denormal is still smaller
4080      * than the smallest representable half-precision denormal, and so we
4081      * will end up ignoring aSig and returning via the "always return zero"
4082      * codepath.
4083      */
4084     zSig |= 0x00800000;
4085     aExp -= 0x3F1;
4086 
4087     return roundAndPackFloat16(aSign, aExp, zSig, ieee, status);
4088 }
4089 
4090 /*----------------------------------------------------------------------------
4091 | Returns the result of converting the double-precision floating-point value
4092 | `a' to the extended double-precision floating-point format.  The conversion
4093 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4094 | Arithmetic.
4095 *----------------------------------------------------------------------------*/
4096 
4097 floatx80 float64_to_floatx80(float64 a, float_status *status)
4098 {
4099     flag aSign;
4100     int aExp;
4101     uint64_t aSig;
4102 
4103     a = float64_squash_input_denormal(a, status);
4104     aSig = extractFloat64Frac( a );
4105     aExp = extractFloat64Exp( a );
4106     aSign = extractFloat64Sign( a );
4107     if ( aExp == 0x7FF ) {
4108         if (aSig) {
4109             return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
4110         }
4111         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4112     }
4113     if ( aExp == 0 ) {
4114         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
4115         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4116     }
4117     return
4118         packFloatx80(
4119             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
4120 
4121 }
4122 
4123 /*----------------------------------------------------------------------------
4124 | Returns the result of converting the double-precision floating-point value
4125 | `a' to the quadruple-precision floating-point format.  The conversion is
4126 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4127 | Arithmetic.
4128 *----------------------------------------------------------------------------*/
4129 
4130 float128 float64_to_float128(float64 a, float_status *status)
4131 {
4132     flag aSign;
4133     int aExp;
4134     uint64_t aSig, zSig0, zSig1;
4135 
4136     a = float64_squash_input_denormal(a, status);
4137     aSig = extractFloat64Frac( a );
4138     aExp = extractFloat64Exp( a );
4139     aSign = extractFloat64Sign( a );
4140     if ( aExp == 0x7FF ) {
4141         if (aSig) {
4142             return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
4143         }
4144         return packFloat128( aSign, 0x7FFF, 0, 0 );
4145     }
4146     if ( aExp == 0 ) {
4147         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
4148         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4149         --aExp;
4150     }
4151     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
4152     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
4153 
4154 }
4155 
4156 
4157 /*----------------------------------------------------------------------------
4158 | Returns the remainder of the double-precision floating-point value `a'
4159 | with respect to the corresponding value `b'.  The operation is performed
4160 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4161 *----------------------------------------------------------------------------*/
4162 
4163 float64 float64_rem(float64 a, float64 b, float_status *status)
4164 {
4165     flag aSign, zSign;
4166     int aExp, bExp, expDiff;
4167     uint64_t aSig, bSig;
4168     uint64_t q, alternateASig;
4169     int64_t sigMean;
4170 
4171     a = float64_squash_input_denormal(a, status);
4172     b = float64_squash_input_denormal(b, status);
4173     aSig = extractFloat64Frac( a );
4174     aExp = extractFloat64Exp( a );
4175     aSign = extractFloat64Sign( a );
4176     bSig = extractFloat64Frac( b );
4177     bExp = extractFloat64Exp( b );
4178     if ( aExp == 0x7FF ) {
4179         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
4180             return propagateFloat64NaN(a, b, status);
4181         }
4182         float_raise(float_flag_invalid, status);
4183         return float64_default_nan(status);
4184     }
4185     if ( bExp == 0x7FF ) {
4186         if (bSig) {
4187             return propagateFloat64NaN(a, b, status);
4188         }
4189         return a;
4190     }
4191     if ( bExp == 0 ) {
4192         if ( bSig == 0 ) {
4193             float_raise(float_flag_invalid, status);
4194             return float64_default_nan(status);
4195         }
4196         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4197     }
4198     if ( aExp == 0 ) {
4199         if ( aSig == 0 ) return a;
4200         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4201     }
4202     expDiff = aExp - bExp;
4203     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
4204     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4205     if ( expDiff < 0 ) {
4206         if ( expDiff < -1 ) return a;
4207         aSig >>= 1;
4208     }
4209     q = ( bSig <= aSig );
4210     if ( q ) aSig -= bSig;
4211     expDiff -= 64;
4212     while ( 0 < expDiff ) {
4213         q = estimateDiv128To64( aSig, 0, bSig );
4214         q = ( 2 < q ) ? q - 2 : 0;
4215         aSig = - ( ( bSig>>2 ) * q );
4216         expDiff -= 62;
4217     }
4218     expDiff += 64;
4219     if ( 0 < expDiff ) {
4220         q = estimateDiv128To64( aSig, 0, bSig );
4221         q = ( 2 < q ) ? q - 2 : 0;
4222         q >>= 64 - expDiff;
4223         bSig >>= 2;
4224         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4225     }
4226     else {
4227         aSig >>= 2;
4228         bSig >>= 2;
4229     }
4230     do {
4231         alternateASig = aSig;
4232         ++q;
4233         aSig -= bSig;
4234     } while ( 0 <= (int64_t) aSig );
4235     sigMean = aSig + alternateASig;
4236     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4237         aSig = alternateASig;
4238     }
4239     zSign = ( (int64_t) aSig < 0 );
4240     if ( zSign ) aSig = - aSig;
4241     return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
4242 
4243 }
4244 
4245 /*----------------------------------------------------------------------------
4246 | Returns the binary log of the double-precision floating-point value `a'.
4247 | The operation is performed according to the IEC/IEEE Standard for Binary
4248 | Floating-Point Arithmetic.
4249 *----------------------------------------------------------------------------*/
4250 float64 float64_log2(float64 a, float_status *status)
4251 {
4252     flag aSign, zSign;
4253     int aExp;
4254     uint64_t aSig, aSig0, aSig1, zSig, i;
4255     a = float64_squash_input_denormal(a, status);
4256 
4257     aSig = extractFloat64Frac( a );
4258     aExp = extractFloat64Exp( a );
4259     aSign = extractFloat64Sign( a );
4260 
4261     if ( aExp == 0 ) {
4262         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4263         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4264     }
4265     if ( aSign ) {
4266         float_raise(float_flag_invalid, status);
4267         return float64_default_nan(status);
4268     }
4269     if ( aExp == 0x7FF ) {
4270         if (aSig) {
4271             return propagateFloat64NaN(a, float64_zero, status);
4272         }
4273         return a;
4274     }
4275 
4276     aExp -= 0x3FF;
4277     aSig |= LIT64( 0x0010000000000000 );
4278     zSign = aExp < 0;
4279     zSig = (uint64_t)aExp << 52;
4280     for (i = 1LL << 51; i > 0; i >>= 1) {
4281         mul64To128( aSig, aSig, &aSig0, &aSig1 );
4282         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4283         if ( aSig & LIT64( 0x0020000000000000 ) ) {
4284             aSig >>= 1;
4285             zSig |= i;
4286         }
4287     }
4288 
4289     if ( zSign )
4290         zSig = -zSig;
4291     return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
4292 }
4293 
4294 /*----------------------------------------------------------------------------
4295 | Returns 1 if the double-precision floating-point value `a' is equal to the
4296 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
4297 | if either operand is a NaN.  Otherwise, the comparison is performed
4298 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4299 *----------------------------------------------------------------------------*/
4300 
4301 int float64_eq(float64 a, float64 b, float_status *status)
4302 {
4303     uint64_t av, bv;
4304     a = float64_squash_input_denormal(a, status);
4305     b = float64_squash_input_denormal(b, status);
4306 
4307     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4308          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4309        ) {
4310         float_raise(float_flag_invalid, status);
4311         return 0;
4312     }
4313     av = float64_val(a);
4314     bv = float64_val(b);
4315     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4316 
4317 }
4318 
4319 /*----------------------------------------------------------------------------
4320 | Returns 1 if the double-precision floating-point value `a' is less than or
4321 | equal to the corresponding value `b', and 0 otherwise.  The invalid
4322 | exception is raised if either operand is a NaN.  The comparison is performed
4323 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4324 *----------------------------------------------------------------------------*/
4325 
4326 int float64_le(float64 a, float64 b, float_status *status)
4327 {
4328     flag aSign, bSign;
4329     uint64_t av, bv;
4330     a = float64_squash_input_denormal(a, status);
4331     b = float64_squash_input_denormal(b, status);
4332 
4333     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4334          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4335        ) {
4336         float_raise(float_flag_invalid, status);
4337         return 0;
4338     }
4339     aSign = extractFloat64Sign( a );
4340     bSign = extractFloat64Sign( b );
4341     av = float64_val(a);
4342     bv = float64_val(b);
4343     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4344     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4345 
4346 }
4347 
4348 /*----------------------------------------------------------------------------
4349 | Returns 1 if the double-precision floating-point value `a' is less than
4350 | the corresponding value `b', and 0 otherwise.  The invalid exception is
4351 | raised if either operand is a NaN.  The comparison is performed according
4352 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4353 *----------------------------------------------------------------------------*/
4354 
4355 int float64_lt(float64 a, float64 b, float_status *status)
4356 {
4357     flag aSign, bSign;
4358     uint64_t av, bv;
4359 
4360     a = float64_squash_input_denormal(a, status);
4361     b = float64_squash_input_denormal(b, status);
4362     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4363          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4364        ) {
4365         float_raise(float_flag_invalid, status);
4366         return 0;
4367     }
4368     aSign = extractFloat64Sign( a );
4369     bSign = extractFloat64Sign( b );
4370     av = float64_val(a);
4371     bv = float64_val(b);
4372     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4373     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4374 
4375 }
4376 
4377 /*----------------------------------------------------------------------------
4378 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4379 | be compared, and 0 otherwise.  The invalid exception is raised if either
4380 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
4381 | Standard for Binary Floating-Point Arithmetic.
4382 *----------------------------------------------------------------------------*/
4383 
4384 int float64_unordered(float64 a, float64 b, float_status *status)
4385 {
4386     a = float64_squash_input_denormal(a, status);
4387     b = float64_squash_input_denormal(b, status);
4388 
4389     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4390          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4391        ) {
4392         float_raise(float_flag_invalid, status);
4393         return 1;
4394     }
4395     return 0;
4396 }
4397 
4398 /*----------------------------------------------------------------------------
4399 | Returns 1 if the double-precision floating-point value `a' is equal to the
4400 | corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4401 | exception.The comparison is performed according to the IEC/IEEE Standard
4402 | for Binary Floating-Point Arithmetic.
4403 *----------------------------------------------------------------------------*/
4404 
4405 int float64_eq_quiet(float64 a, float64 b, float_status *status)
4406 {
4407     uint64_t av, bv;
4408     a = float64_squash_input_denormal(a, status);
4409     b = float64_squash_input_denormal(b, status);
4410 
4411     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4412          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4413        ) {
4414         if (float64_is_signaling_nan(a, status)
4415          || float64_is_signaling_nan(b, status)) {
4416             float_raise(float_flag_invalid, status);
4417         }
4418         return 0;
4419     }
4420     av = float64_val(a);
4421     bv = float64_val(b);
4422     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4423 
4424 }
4425 
4426 /*----------------------------------------------------------------------------
4427 | Returns 1 if the double-precision floating-point value `a' is less than or
4428 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4429 | cause an exception.  Otherwise, the comparison is performed according to the
4430 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4431 *----------------------------------------------------------------------------*/
4432 
4433 int float64_le_quiet(float64 a, float64 b, float_status *status)
4434 {
4435     flag aSign, bSign;
4436     uint64_t av, bv;
4437     a = float64_squash_input_denormal(a, status);
4438     b = float64_squash_input_denormal(b, status);
4439 
4440     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4441          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4442        ) {
4443         if (float64_is_signaling_nan(a, status)
4444          || float64_is_signaling_nan(b, status)) {
4445             float_raise(float_flag_invalid, status);
4446         }
4447         return 0;
4448     }
4449     aSign = extractFloat64Sign( a );
4450     bSign = extractFloat64Sign( b );
4451     av = float64_val(a);
4452     bv = float64_val(b);
4453     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4454     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4455 
4456 }
4457 
4458 /*----------------------------------------------------------------------------
4459 | Returns 1 if the double-precision floating-point value `a' is less than
4460 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4461 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4462 | Standard for Binary Floating-Point Arithmetic.
4463 *----------------------------------------------------------------------------*/
4464 
4465 int float64_lt_quiet(float64 a, float64 b, float_status *status)
4466 {
4467     flag aSign, bSign;
4468     uint64_t av, bv;
4469     a = float64_squash_input_denormal(a, status);
4470     b = float64_squash_input_denormal(b, status);
4471 
4472     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4473          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4474        ) {
4475         if (float64_is_signaling_nan(a, status)
4476          || float64_is_signaling_nan(b, status)) {
4477             float_raise(float_flag_invalid, status);
4478         }
4479         return 0;
4480     }
4481     aSign = extractFloat64Sign( a );
4482     bSign = extractFloat64Sign( b );
4483     av = float64_val(a);
4484     bv = float64_val(b);
4485     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4486     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4487 
4488 }
4489 
4490 /*----------------------------------------------------------------------------
4491 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4492 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
4493 | comparison is performed according to the IEC/IEEE Standard for Binary
4494 | Floating-Point Arithmetic.
4495 *----------------------------------------------------------------------------*/
4496 
4497 int float64_unordered_quiet(float64 a, float64 b, float_status *status)
4498 {
4499     a = float64_squash_input_denormal(a, status);
4500     b = float64_squash_input_denormal(b, status);
4501 
4502     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4503          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4504        ) {
4505         if (float64_is_signaling_nan(a, status)
4506          || float64_is_signaling_nan(b, status)) {
4507             float_raise(float_flag_invalid, status);
4508         }
4509         return 1;
4510     }
4511     return 0;
4512 }
4513 
4514 /*----------------------------------------------------------------------------
4515 | Returns the result of converting the extended double-precision floating-
4516 | point value `a' to the 32-bit two's complement integer format.  The
4517 | conversion is performed according to the IEC/IEEE Standard for Binary
4518 | Floating-Point Arithmetic---which means in particular that the conversion
4519 | is rounded according to the current rounding mode.  If `a' is a NaN, the
4520 | largest positive integer is returned.  Otherwise, if the conversion
4521 | overflows, the largest integer with the same sign as `a' is returned.
4522 *----------------------------------------------------------------------------*/
4523 
4524 int32_t floatx80_to_int32(floatx80 a, float_status *status)
4525 {
4526     flag aSign;
4527     int32_t aExp, shiftCount;
4528     uint64_t aSig;
4529 
4530     if (floatx80_invalid_encoding(a)) {
4531         float_raise(float_flag_invalid, status);
4532         return 1 << 31;
4533     }
4534     aSig = extractFloatx80Frac( a );
4535     aExp = extractFloatx80Exp( a );
4536     aSign = extractFloatx80Sign( a );
4537     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4538     shiftCount = 0x4037 - aExp;
4539     if ( shiftCount <= 0 ) shiftCount = 1;
4540     shift64RightJamming( aSig, shiftCount, &aSig );
4541     return roundAndPackInt32(aSign, aSig, status);
4542 
4543 }
4544 
4545 /*----------------------------------------------------------------------------
4546 | Returns the result of converting the extended double-precision floating-
4547 | point value `a' to the 32-bit two's complement integer format.  The
4548 | conversion is performed according to the IEC/IEEE Standard for Binary
4549 | Floating-Point Arithmetic, except that the conversion is always rounded
4550 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4551 | Otherwise, if the conversion overflows, the largest integer with the same
4552 | sign as `a' is returned.
4553 *----------------------------------------------------------------------------*/
4554 
4555 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
4556 {
4557     flag aSign;
4558     int32_t aExp, shiftCount;
4559     uint64_t aSig, savedASig;
4560     int32_t z;
4561 
4562     if (floatx80_invalid_encoding(a)) {
4563         float_raise(float_flag_invalid, status);
4564         return 1 << 31;
4565     }
4566     aSig = extractFloatx80Frac( a );
4567     aExp = extractFloatx80Exp( a );
4568     aSign = extractFloatx80Sign( a );
4569     if ( 0x401E < aExp ) {
4570         if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4571         goto invalid;
4572     }
4573     else if ( aExp < 0x3FFF ) {
4574         if (aExp || aSig) {
4575             status->float_exception_flags |= float_flag_inexact;
4576         }
4577         return 0;
4578     }
4579     shiftCount = 0x403E - aExp;
4580     savedASig = aSig;
4581     aSig >>= shiftCount;
4582     z = aSig;
4583     if ( aSign ) z = - z;
4584     if ( ( z < 0 ) ^ aSign ) {
4585  invalid:
4586         float_raise(float_flag_invalid, status);
4587         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4588     }
4589     if ( ( aSig<<shiftCount ) != savedASig ) {
4590         status->float_exception_flags |= float_flag_inexact;
4591     }
4592     return z;
4593 
4594 }
4595 
4596 /*----------------------------------------------------------------------------
4597 | Returns the result of converting the extended double-precision floating-
4598 | point value `a' to the 64-bit two's complement integer format.  The
4599 | conversion is performed according to the IEC/IEEE Standard for Binary
4600 | Floating-Point Arithmetic---which means in particular that the conversion
4601 | is rounded according to the current rounding mode.  If `a' is a NaN,
4602 | the largest positive integer is returned.  Otherwise, if the conversion
4603 | overflows, the largest integer with the same sign as `a' is returned.
4604 *----------------------------------------------------------------------------*/
4605 
4606 int64_t floatx80_to_int64(floatx80 a, float_status *status)
4607 {
4608     flag aSign;
4609     int32_t aExp, shiftCount;
4610     uint64_t aSig, aSigExtra;
4611 
4612     if (floatx80_invalid_encoding(a)) {
4613         float_raise(float_flag_invalid, status);
4614         return 1ULL << 63;
4615     }
4616     aSig = extractFloatx80Frac( a );
4617     aExp = extractFloatx80Exp( a );
4618     aSign = extractFloatx80Sign( a );
4619     shiftCount = 0x403E - aExp;
4620     if ( shiftCount <= 0 ) {
4621         if ( shiftCount ) {
4622             float_raise(float_flag_invalid, status);
4623             if (    ! aSign
4624                  || (    ( aExp == 0x7FFF )
4625                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
4626                ) {
4627                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4628             }
4629             return (int64_t) LIT64( 0x8000000000000000 );
4630         }
4631         aSigExtra = 0;
4632     }
4633     else {
4634         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4635     }
4636     return roundAndPackInt64(aSign, aSig, aSigExtra, status);
4637 
4638 }
4639 
4640 /*----------------------------------------------------------------------------
4641 | Returns the result of converting the extended double-precision floating-
4642 | point value `a' to the 64-bit two's complement integer format.  The
4643 | conversion is performed according to the IEC/IEEE Standard for Binary
4644 | Floating-Point Arithmetic, except that the conversion is always rounded
4645 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4646 | Otherwise, if the conversion overflows, the largest integer with the same
4647 | sign as `a' is returned.
4648 *----------------------------------------------------------------------------*/
4649 
4650 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
4651 {
4652     flag aSign;
4653     int32_t aExp, shiftCount;
4654     uint64_t aSig;
4655     int64_t z;
4656 
4657     if (floatx80_invalid_encoding(a)) {
4658         float_raise(float_flag_invalid, status);
4659         return 1ULL << 63;
4660     }
4661     aSig = extractFloatx80Frac( a );
4662     aExp = extractFloatx80Exp( a );
4663     aSign = extractFloatx80Sign( a );
4664     shiftCount = aExp - 0x403E;
4665     if ( 0 <= shiftCount ) {
4666         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4667         if ( ( a.high != 0xC03E ) || aSig ) {
4668             float_raise(float_flag_invalid, status);
4669             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4670                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4671             }
4672         }
4673         return (int64_t) LIT64( 0x8000000000000000 );
4674     }
4675     else if ( aExp < 0x3FFF ) {
4676         if (aExp | aSig) {
4677             status->float_exception_flags |= float_flag_inexact;
4678         }
4679         return 0;
4680     }
4681     z = aSig>>( - shiftCount );
4682     if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
4683         status->float_exception_flags |= float_flag_inexact;
4684     }
4685     if ( aSign ) z = - z;
4686     return z;
4687 
4688 }
4689 
4690 /*----------------------------------------------------------------------------
4691 | Returns the result of converting the extended double-precision floating-
4692 | point value `a' to the single-precision floating-point format.  The
4693 | conversion is performed according to the IEC/IEEE Standard for Binary
4694 | Floating-Point Arithmetic.
4695 *----------------------------------------------------------------------------*/
4696 
4697 float32 floatx80_to_float32(floatx80 a, float_status *status)
4698 {
4699     flag aSign;
4700     int32_t aExp;
4701     uint64_t aSig;
4702 
4703     if (floatx80_invalid_encoding(a)) {
4704         float_raise(float_flag_invalid, status);
4705         return float32_default_nan(status);
4706     }
4707     aSig = extractFloatx80Frac( a );
4708     aExp = extractFloatx80Exp( a );
4709     aSign = extractFloatx80Sign( a );
4710     if ( aExp == 0x7FFF ) {
4711         if ( (uint64_t) ( aSig<<1 ) ) {
4712             return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
4713         }
4714         return packFloat32( aSign, 0xFF, 0 );
4715     }
4716     shift64RightJamming( aSig, 33, &aSig );
4717     if ( aExp || aSig ) aExp -= 0x3F81;
4718     return roundAndPackFloat32(aSign, aExp, aSig, status);
4719 
4720 }
4721 
4722 /*----------------------------------------------------------------------------
4723 | Returns the result of converting the extended double-precision floating-
4724 | point value `a' to the double-precision floating-point format.  The
4725 | conversion is performed according to the IEC/IEEE Standard for Binary
4726 | Floating-Point Arithmetic.
4727 *----------------------------------------------------------------------------*/
4728 
4729 float64 floatx80_to_float64(floatx80 a, float_status *status)
4730 {
4731     flag aSign;
4732     int32_t aExp;
4733     uint64_t aSig, zSig;
4734 
4735     if (floatx80_invalid_encoding(a)) {
4736         float_raise(float_flag_invalid, status);
4737         return float64_default_nan(status);
4738     }
4739     aSig = extractFloatx80Frac( a );
4740     aExp = extractFloatx80Exp( a );
4741     aSign = extractFloatx80Sign( a );
4742     if ( aExp == 0x7FFF ) {
4743         if ( (uint64_t) ( aSig<<1 ) ) {
4744             return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
4745         }
4746         return packFloat64( aSign, 0x7FF, 0 );
4747     }
4748     shift64RightJamming( aSig, 1, &zSig );
4749     if ( aExp || aSig ) aExp -= 0x3C01;
4750     return roundAndPackFloat64(aSign, aExp, zSig, status);
4751 
4752 }
4753 
4754 /*----------------------------------------------------------------------------
4755 | Returns the result of converting the extended double-precision floating-
4756 | point value `a' to the quadruple-precision floating-point format.  The
4757 | conversion is performed according to the IEC/IEEE Standard for Binary
4758 | Floating-Point Arithmetic.
4759 *----------------------------------------------------------------------------*/
4760 
4761 float128 floatx80_to_float128(floatx80 a, float_status *status)
4762 {
4763     flag aSign;
4764     int aExp;
4765     uint64_t aSig, zSig0, zSig1;
4766 
4767     if (floatx80_invalid_encoding(a)) {
4768         float_raise(float_flag_invalid, status);
4769         return float128_default_nan(status);
4770     }
4771     aSig = extractFloatx80Frac( a );
4772     aExp = extractFloatx80Exp( a );
4773     aSign = extractFloatx80Sign( a );
4774     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
4775         return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
4776     }
4777     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4778     return packFloat128( aSign, aExp, zSig0, zSig1 );
4779 
4780 }
4781 
4782 /*----------------------------------------------------------------------------
4783 | Rounds the extended double-precision floating-point value `a'
4784 | to the precision provided by floatx80_rounding_precision and returns the
4785 | result as an extended double-precision floating-point value.
4786 | The operation is performed according to the IEC/IEEE Standard for Binary
4787 | Floating-Point Arithmetic.
4788 *----------------------------------------------------------------------------*/
4789 
4790 floatx80 floatx80_round(floatx80 a, float_status *status)
4791 {
4792     return roundAndPackFloatx80(status->floatx80_rounding_precision,
4793                                 extractFloatx80Sign(a),
4794                                 extractFloatx80Exp(a),
4795                                 extractFloatx80Frac(a), 0, status);
4796 }
4797 
4798 /*----------------------------------------------------------------------------
4799 | Rounds the extended double-precision floating-point value `a' to an integer,
4800 | and returns the result as an extended quadruple-precision floating-point
4801 | value.  The operation is performed according to the IEC/IEEE Standard for
4802 | Binary Floating-Point Arithmetic.
4803 *----------------------------------------------------------------------------*/
4804 
4805 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
4806 {
4807     flag aSign;
4808     int32_t aExp;
4809     uint64_t lastBitMask, roundBitsMask;
4810     floatx80 z;
4811 
4812     if (floatx80_invalid_encoding(a)) {
4813         float_raise(float_flag_invalid, status);
4814         return floatx80_default_nan(status);
4815     }
4816     aExp = extractFloatx80Exp( a );
4817     if ( 0x403E <= aExp ) {
4818         if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
4819             return propagateFloatx80NaN(a, a, status);
4820         }
4821         return a;
4822     }
4823     if ( aExp < 0x3FFF ) {
4824         if (    ( aExp == 0 )
4825              && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
4826             return a;
4827         }
4828         status->float_exception_flags |= float_flag_inexact;
4829         aSign = extractFloatx80Sign( a );
4830         switch (status->float_rounding_mode) {
4831          case float_round_nearest_even:
4832             if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
4833                ) {
4834                 return
4835                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4836             }
4837             break;
4838         case float_round_ties_away:
4839             if (aExp == 0x3FFE) {
4840                 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
4841             }
4842             break;
4843          case float_round_down:
4844             return
4845                   aSign ?
4846                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4847                 : packFloatx80( 0, 0, 0 );
4848          case float_round_up:
4849             return
4850                   aSign ? packFloatx80( 1, 0, 0 )
4851                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4852         }
4853         return packFloatx80( aSign, 0, 0 );
4854     }
4855     lastBitMask = 1;
4856     lastBitMask <<= 0x403E - aExp;
4857     roundBitsMask = lastBitMask - 1;
4858     z = a;
4859     switch (status->float_rounding_mode) {
4860     case float_round_nearest_even:
4861         z.low += lastBitMask>>1;
4862         if ((z.low & roundBitsMask) == 0) {
4863             z.low &= ~lastBitMask;
4864         }
4865         break;
4866     case float_round_ties_away:
4867         z.low += lastBitMask >> 1;
4868         break;
4869     case float_round_to_zero:
4870         break;
4871     case float_round_up:
4872         if (!extractFloatx80Sign(z)) {
4873             z.low += roundBitsMask;
4874         }
4875         break;
4876     case float_round_down:
4877         if (extractFloatx80Sign(z)) {
4878             z.low += roundBitsMask;
4879         }
4880         break;
4881     default:
4882         abort();
4883     }
4884     z.low &= ~ roundBitsMask;
4885     if ( z.low == 0 ) {
4886         ++z.high;
4887         z.low = LIT64( 0x8000000000000000 );
4888     }
4889     if (z.low != a.low) {
4890         status->float_exception_flags |= float_flag_inexact;
4891     }
4892     return z;
4893 
4894 }
4895 
4896 /*----------------------------------------------------------------------------
4897 | Returns the result of adding the absolute values of the extended double-
4898 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
4899 | negated before being returned.  `zSign' is ignored if the result is a NaN.
4900 | The addition is performed according to the IEC/IEEE Standard for Binary
4901 | Floating-Point Arithmetic.
4902 *----------------------------------------------------------------------------*/
4903 
4904 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
4905                                 float_status *status)
4906 {
4907     int32_t aExp, bExp, zExp;
4908     uint64_t aSig, bSig, zSig0, zSig1;
4909     int32_t expDiff;
4910 
4911     aSig = extractFloatx80Frac( a );
4912     aExp = extractFloatx80Exp( a );
4913     bSig = extractFloatx80Frac( b );
4914     bExp = extractFloatx80Exp( b );
4915     expDiff = aExp - bExp;
4916     if ( 0 < expDiff ) {
4917         if ( aExp == 0x7FFF ) {
4918             if ((uint64_t)(aSig << 1)) {
4919                 return propagateFloatx80NaN(a, b, status);
4920             }
4921             return a;
4922         }
4923         if ( bExp == 0 ) --expDiff;
4924         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4925         zExp = aExp;
4926     }
4927     else if ( expDiff < 0 ) {
4928         if ( bExp == 0x7FFF ) {
4929             if ((uint64_t)(bSig << 1)) {
4930                 return propagateFloatx80NaN(a, b, status);
4931             }
4932             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4933         }
4934         if ( aExp == 0 ) ++expDiff;
4935         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4936         zExp = bExp;
4937     }
4938     else {
4939         if ( aExp == 0x7FFF ) {
4940             if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4941                 return propagateFloatx80NaN(a, b, status);
4942             }
4943             return a;
4944         }
4945         zSig1 = 0;
4946         zSig0 = aSig + bSig;
4947         if ( aExp == 0 ) {
4948             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4949             goto roundAndPack;
4950         }
4951         zExp = aExp;
4952         goto shiftRight1;
4953     }
4954     zSig0 = aSig + bSig;
4955     if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
4956  shiftRight1:
4957     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4958     zSig0 |= LIT64( 0x8000000000000000 );
4959     ++zExp;
4960  roundAndPack:
4961     return roundAndPackFloatx80(status->floatx80_rounding_precision,
4962                                 zSign, zExp, zSig0, zSig1, status);
4963 }
4964 
4965 /*----------------------------------------------------------------------------
4966 | Returns the result of subtracting the absolute values of the extended
4967 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
4968 | difference is negated before being returned.  `zSign' is ignored if the
4969 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
4970 | Standard for Binary Floating-Point Arithmetic.
4971 *----------------------------------------------------------------------------*/
4972 
4973 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
4974                                 float_status *status)
4975 {
4976     int32_t aExp, bExp, zExp;
4977     uint64_t aSig, bSig, zSig0, zSig1;
4978     int32_t expDiff;
4979 
4980     aSig = extractFloatx80Frac( a );
4981     aExp = extractFloatx80Exp( a );
4982     bSig = extractFloatx80Frac( b );
4983     bExp = extractFloatx80Exp( b );
4984     expDiff = aExp - bExp;
4985     if ( 0 < expDiff ) goto aExpBigger;
4986     if ( expDiff < 0 ) goto bExpBigger;
4987     if ( aExp == 0x7FFF ) {
4988         if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4989             return propagateFloatx80NaN(a, b, status);
4990         }
4991         float_raise(float_flag_invalid, status);
4992         return floatx80_default_nan(status);
4993     }
4994     if ( aExp == 0 ) {
4995         aExp = 1;
4996         bExp = 1;
4997     }
4998     zSig1 = 0;
4999     if ( bSig < aSig ) goto aBigger;
5000     if ( aSig < bSig ) goto bBigger;
5001     return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
5002  bExpBigger:
5003     if ( bExp == 0x7FFF ) {
5004         if ((uint64_t)(bSig << 1)) {
5005             return propagateFloatx80NaN(a, b, status);
5006         }
5007         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
5008     }
5009     if ( aExp == 0 ) ++expDiff;
5010     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5011  bBigger:
5012     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
5013     zExp = bExp;
5014     zSign ^= 1;
5015     goto normalizeRoundAndPack;
5016  aExpBigger:
5017     if ( aExp == 0x7FFF ) {
5018         if ((uint64_t)(aSig << 1)) {
5019             return propagateFloatx80NaN(a, b, status);
5020         }
5021         return a;
5022     }
5023     if ( bExp == 0 ) --expDiff;
5024     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5025  aBigger:
5026     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
5027     zExp = aExp;
5028  normalizeRoundAndPack:
5029     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
5030                                          zSign, zExp, zSig0, zSig1, status);
5031 }
5032 
5033 /*----------------------------------------------------------------------------
5034 | Returns the result of adding the extended double-precision floating-point
5035 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
5036 | Standard for Binary Floating-Point Arithmetic.
5037 *----------------------------------------------------------------------------*/
5038 
5039 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
5040 {
5041     flag aSign, bSign;
5042 
5043     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5044         float_raise(float_flag_invalid, status);
5045         return floatx80_default_nan(status);
5046     }
5047     aSign = extractFloatx80Sign( a );
5048     bSign = extractFloatx80Sign( b );
5049     if ( aSign == bSign ) {
5050         return addFloatx80Sigs(a, b, aSign, status);
5051     }
5052     else {
5053         return subFloatx80Sigs(a, b, aSign, status);
5054     }
5055 
5056 }
5057 
5058 /*----------------------------------------------------------------------------
5059 | Returns the result of subtracting the extended double-precision floating-
5060 | point values `a' and `b'.  The operation is performed according to the
5061 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5062 *----------------------------------------------------------------------------*/
5063 
5064 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
5065 {
5066     flag aSign, bSign;
5067 
5068     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5069         float_raise(float_flag_invalid, status);
5070         return floatx80_default_nan(status);
5071     }
5072     aSign = extractFloatx80Sign( a );
5073     bSign = extractFloatx80Sign( b );
5074     if ( aSign == bSign ) {
5075         return subFloatx80Sigs(a, b, aSign, status);
5076     }
5077     else {
5078         return addFloatx80Sigs(a, b, aSign, status);
5079     }
5080 
5081 }
5082 
5083 /*----------------------------------------------------------------------------
5084 | Returns the result of multiplying the extended double-precision floating-
5085 | point values `a' and `b'.  The operation is performed according to the
5086 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5087 *----------------------------------------------------------------------------*/
5088 
5089 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
5090 {
5091     flag aSign, bSign, zSign;
5092     int32_t aExp, bExp, zExp;
5093     uint64_t aSig, bSig, zSig0, zSig1;
5094 
5095     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5096         float_raise(float_flag_invalid, status);
5097         return floatx80_default_nan(status);
5098     }
5099     aSig = extractFloatx80Frac( a );
5100     aExp = extractFloatx80Exp( a );
5101     aSign = extractFloatx80Sign( a );
5102     bSig = extractFloatx80Frac( b );
5103     bExp = extractFloatx80Exp( b );
5104     bSign = extractFloatx80Sign( b );
5105     zSign = aSign ^ bSign;
5106     if ( aExp == 0x7FFF ) {
5107         if (    (uint64_t) ( aSig<<1 )
5108              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5109             return propagateFloatx80NaN(a, b, status);
5110         }
5111         if ( ( bExp | bSig ) == 0 ) goto invalid;
5112         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5113     }
5114     if ( bExp == 0x7FFF ) {
5115         if ((uint64_t)(bSig << 1)) {
5116             return propagateFloatx80NaN(a, b, status);
5117         }
5118         if ( ( aExp | aSig ) == 0 ) {
5119  invalid:
5120             float_raise(float_flag_invalid, status);
5121             return floatx80_default_nan(status);
5122         }
5123         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5124     }
5125     if ( aExp == 0 ) {
5126         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5127         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5128     }
5129     if ( bExp == 0 ) {
5130         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5131         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5132     }
5133     zExp = aExp + bExp - 0x3FFE;
5134     mul64To128( aSig, bSig, &zSig0, &zSig1 );
5135     if ( 0 < (int64_t) zSig0 ) {
5136         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5137         --zExp;
5138     }
5139     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5140                                 zSign, zExp, zSig0, zSig1, status);
5141 }
5142 
5143 /*----------------------------------------------------------------------------
5144 | Returns the result of dividing the extended double-precision floating-point
5145 | value `a' by the corresponding value `b'.  The operation is performed
5146 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5147 *----------------------------------------------------------------------------*/
5148 
5149 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
5150 {
5151     flag aSign, bSign, zSign;
5152     int32_t aExp, bExp, zExp;
5153     uint64_t aSig, bSig, zSig0, zSig1;
5154     uint64_t rem0, rem1, rem2, term0, term1, term2;
5155 
5156     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5157         float_raise(float_flag_invalid, status);
5158         return floatx80_default_nan(status);
5159     }
5160     aSig = extractFloatx80Frac( a );
5161     aExp = extractFloatx80Exp( a );
5162     aSign = extractFloatx80Sign( a );
5163     bSig = extractFloatx80Frac( b );
5164     bExp = extractFloatx80Exp( b );
5165     bSign = extractFloatx80Sign( b );
5166     zSign = aSign ^ bSign;
5167     if ( aExp == 0x7FFF ) {
5168         if ((uint64_t)(aSig << 1)) {
5169             return propagateFloatx80NaN(a, b, status);
5170         }
5171         if ( bExp == 0x7FFF ) {
5172             if ((uint64_t)(bSig << 1)) {
5173                 return propagateFloatx80NaN(a, b, status);
5174             }
5175             goto invalid;
5176         }
5177         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5178     }
5179     if ( bExp == 0x7FFF ) {
5180         if ((uint64_t)(bSig << 1)) {
5181             return propagateFloatx80NaN(a, b, status);
5182         }
5183         return packFloatx80( zSign, 0, 0 );
5184     }
5185     if ( bExp == 0 ) {
5186         if ( bSig == 0 ) {
5187             if ( ( aExp | aSig ) == 0 ) {
5188  invalid:
5189                 float_raise(float_flag_invalid, status);
5190                 return floatx80_default_nan(status);
5191             }
5192             float_raise(float_flag_divbyzero, status);
5193             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5194         }
5195         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5196     }
5197     if ( aExp == 0 ) {
5198         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5199         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5200     }
5201     zExp = aExp - bExp + 0x3FFE;
5202     rem1 = 0;
5203     if ( bSig <= aSig ) {
5204         shift128Right( aSig, 0, 1, &aSig, &rem1 );
5205         ++zExp;
5206     }
5207     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5208     mul64To128( bSig, zSig0, &term0, &term1 );
5209     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
5210     while ( (int64_t) rem0 < 0 ) {
5211         --zSig0;
5212         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5213     }
5214     zSig1 = estimateDiv128To64( rem1, 0, bSig );
5215     if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
5216         mul64To128( bSig, zSig1, &term1, &term2 );
5217         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5218         while ( (int64_t) rem1 < 0 ) {
5219             --zSig1;
5220             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5221         }
5222         zSig1 |= ( ( rem1 | rem2 ) != 0 );
5223     }
5224     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5225                                 zSign, zExp, zSig0, zSig1, status);
5226 }
5227 
5228 /*----------------------------------------------------------------------------
5229 | Returns the remainder of the extended double-precision floating-point value
5230 | `a' with respect to the corresponding value `b'.  The operation is performed
5231 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5232 *----------------------------------------------------------------------------*/
5233 
5234 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
5235 {
5236     flag aSign, zSign;
5237     int32_t aExp, bExp, expDiff;
5238     uint64_t aSig0, aSig1, bSig;
5239     uint64_t q, term0, term1, alternateASig0, alternateASig1;
5240 
5241     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5242         float_raise(float_flag_invalid, status);
5243         return floatx80_default_nan(status);
5244     }
5245     aSig0 = extractFloatx80Frac( a );
5246     aExp = extractFloatx80Exp( a );
5247     aSign = extractFloatx80Sign( a );
5248     bSig = extractFloatx80Frac( b );
5249     bExp = extractFloatx80Exp( b );
5250     if ( aExp == 0x7FFF ) {
5251         if (    (uint64_t) ( aSig0<<1 )
5252              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5253             return propagateFloatx80NaN(a, b, status);
5254         }
5255         goto invalid;
5256     }
5257     if ( bExp == 0x7FFF ) {
5258         if ((uint64_t)(bSig << 1)) {
5259             return propagateFloatx80NaN(a, b, status);
5260         }
5261         return a;
5262     }
5263     if ( bExp == 0 ) {
5264         if ( bSig == 0 ) {
5265  invalid:
5266             float_raise(float_flag_invalid, status);
5267             return floatx80_default_nan(status);
5268         }
5269         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5270     }
5271     if ( aExp == 0 ) {
5272         if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
5273         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5274     }
5275     bSig |= LIT64( 0x8000000000000000 );
5276     zSign = aSign;
5277     expDiff = aExp - bExp;
5278     aSig1 = 0;
5279     if ( expDiff < 0 ) {
5280         if ( expDiff < -1 ) return a;
5281         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5282         expDiff = 0;
5283     }
5284     q = ( bSig <= aSig0 );
5285     if ( q ) aSig0 -= bSig;
5286     expDiff -= 64;
5287     while ( 0 < expDiff ) {
5288         q = estimateDiv128To64( aSig0, aSig1, bSig );
5289         q = ( 2 < q ) ? q - 2 : 0;
5290         mul64To128( bSig, q, &term0, &term1 );
5291         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5292         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5293         expDiff -= 62;
5294     }
5295     expDiff += 64;
5296     if ( 0 < expDiff ) {
5297         q = estimateDiv128To64( aSig0, aSig1, bSig );
5298         q = ( 2 < q ) ? q - 2 : 0;
5299         q >>= 64 - expDiff;
5300         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5301         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5302         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5303         while ( le128( term0, term1, aSig0, aSig1 ) ) {
5304             ++q;
5305             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5306         }
5307     }
5308     else {
5309         term1 = 0;
5310         term0 = bSig;
5311     }
5312     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5313     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5314          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5315               && ( q & 1 ) )
5316        ) {
5317         aSig0 = alternateASig0;
5318         aSig1 = alternateASig1;
5319         zSign = ! zSign;
5320     }
5321     return
5322         normalizeRoundAndPackFloatx80(
5323             80, zSign, bExp + expDiff, aSig0, aSig1, status);
5324 
5325 }
5326 
5327 /*----------------------------------------------------------------------------
5328 | Returns the square root of the extended double-precision floating-point
5329 | value `a'.  The operation is performed according to the IEC/IEEE Standard
5330 | for Binary Floating-Point Arithmetic.
5331 *----------------------------------------------------------------------------*/
5332 
5333 floatx80 floatx80_sqrt(floatx80 a, float_status *status)
5334 {
5335     flag aSign;
5336     int32_t aExp, zExp;
5337     uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5338     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5339 
5340     if (floatx80_invalid_encoding(a)) {
5341         float_raise(float_flag_invalid, status);
5342         return floatx80_default_nan(status);
5343     }
5344     aSig0 = extractFloatx80Frac( a );
5345     aExp = extractFloatx80Exp( a );
5346     aSign = extractFloatx80Sign( a );
5347     if ( aExp == 0x7FFF ) {
5348         if ((uint64_t)(aSig0 << 1)) {
5349             return propagateFloatx80NaN(a, a, status);
5350         }
5351         if ( ! aSign ) return a;
5352         goto invalid;
5353     }
5354     if ( aSign ) {
5355         if ( ( aExp | aSig0 ) == 0 ) return a;
5356  invalid:
5357         float_raise(float_flag_invalid, status);
5358         return floatx80_default_nan(status);
5359     }
5360     if ( aExp == 0 ) {
5361         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5362         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5363     }
5364     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5365     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5366     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5367     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5368     doubleZSig0 = zSig0<<1;
5369     mul64To128( zSig0, zSig0, &term0, &term1 );
5370     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5371     while ( (int64_t) rem0 < 0 ) {
5372         --zSig0;
5373         doubleZSig0 -= 2;
5374         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5375     }
5376     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5377     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5378         if ( zSig1 == 0 ) zSig1 = 1;
5379         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5380         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5381         mul64To128( zSig1, zSig1, &term2, &term3 );
5382         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5383         while ( (int64_t) rem1 < 0 ) {
5384             --zSig1;
5385             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5386             term3 |= 1;
5387             term2 |= doubleZSig0;
5388             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5389         }
5390         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5391     }
5392     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5393     zSig0 |= doubleZSig0;
5394     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5395                                 0, zExp, zSig0, zSig1, status);
5396 }
5397 
5398 /*----------------------------------------------------------------------------
5399 | Returns 1 if the extended double-precision floating-point value `a' is equal
5400 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
5401 | raised if either operand is a NaN.  Otherwise, the comparison is performed
5402 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5403 *----------------------------------------------------------------------------*/
5404 
5405 int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
5406 {
5407 
5408     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5409         || (extractFloatx80Exp(a) == 0x7FFF
5410             && (uint64_t) (extractFloatx80Frac(a) << 1))
5411         || (extractFloatx80Exp(b) == 0x7FFF
5412             && (uint64_t) (extractFloatx80Frac(b) << 1))
5413        ) {
5414         float_raise(float_flag_invalid, status);
5415         return 0;
5416     }
5417     return
5418            ( a.low == b.low )
5419         && (    ( a.high == b.high )
5420              || (    ( a.low == 0 )
5421                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5422            );
5423 
5424 }
5425 
5426 /*----------------------------------------------------------------------------
5427 | Returns 1 if the extended double-precision floating-point value `a' is
5428 | less than or equal to the corresponding value `b', and 0 otherwise.  The
5429 | invalid exception is raised if either operand is a NaN.  The comparison is
5430 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5431 | Arithmetic.
5432 *----------------------------------------------------------------------------*/
5433 
5434 int floatx80_le(floatx80 a, floatx80 b, float_status *status)
5435 {
5436     flag aSign, bSign;
5437 
5438     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5439         || (extractFloatx80Exp(a) == 0x7FFF
5440             && (uint64_t) (extractFloatx80Frac(a) << 1))
5441         || (extractFloatx80Exp(b) == 0x7FFF
5442             && (uint64_t) (extractFloatx80Frac(b) << 1))
5443        ) {
5444         float_raise(float_flag_invalid, status);
5445         return 0;
5446     }
5447     aSign = extractFloatx80Sign( a );
5448     bSign = extractFloatx80Sign( b );
5449     if ( aSign != bSign ) {
5450         return
5451                aSign
5452             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5453                  == 0 );
5454     }
5455     return
5456           aSign ? le128( b.high, b.low, a.high, a.low )
5457         : le128( a.high, a.low, b.high, b.low );
5458 
5459 }
5460 
5461 /*----------------------------------------------------------------------------
5462 | Returns 1 if the extended double-precision floating-point value `a' is
5463 | less than the corresponding value `b', and 0 otherwise.  The invalid
5464 | exception is raised if either operand is a NaN.  The comparison is performed
5465 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5466 *----------------------------------------------------------------------------*/
5467 
5468 int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
5469 {
5470     flag aSign, bSign;
5471 
5472     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5473         || (extractFloatx80Exp(a) == 0x7FFF
5474             && (uint64_t) (extractFloatx80Frac(a) << 1))
5475         || (extractFloatx80Exp(b) == 0x7FFF
5476             && (uint64_t) (extractFloatx80Frac(b) << 1))
5477        ) {
5478         float_raise(float_flag_invalid, status);
5479         return 0;
5480     }
5481     aSign = extractFloatx80Sign( a );
5482     bSign = extractFloatx80Sign( b );
5483     if ( aSign != bSign ) {
5484         return
5485                aSign
5486             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5487                  != 0 );
5488     }
5489     return
5490           aSign ? lt128( b.high, b.low, a.high, a.low )
5491         : lt128( a.high, a.low, b.high, b.low );
5492 
5493 }
5494 
5495 /*----------------------------------------------------------------------------
5496 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5497 | cannot be compared, and 0 otherwise.  The invalid exception is raised if
5498 | either operand is a NaN.   The comparison is performed according to the
5499 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5500 *----------------------------------------------------------------------------*/
5501 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
5502 {
5503     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5504         || (extractFloatx80Exp(a) == 0x7FFF
5505             && (uint64_t) (extractFloatx80Frac(a) << 1))
5506         || (extractFloatx80Exp(b) == 0x7FFF
5507             && (uint64_t) (extractFloatx80Frac(b) << 1))
5508        ) {
5509         float_raise(float_flag_invalid, status);
5510         return 1;
5511     }
5512     return 0;
5513 }
5514 
5515 /*----------------------------------------------------------------------------
5516 | Returns 1 if the extended double-precision floating-point value `a' is
5517 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5518 | cause an exception.  The comparison is performed according to the IEC/IEEE
5519 | Standard for Binary Floating-Point Arithmetic.
5520 *----------------------------------------------------------------------------*/
5521 
5522 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
5523 {
5524 
5525     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5526         float_raise(float_flag_invalid, status);
5527         return 0;
5528     }
5529     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5530               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5531          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5532               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5533        ) {
5534         if (floatx80_is_signaling_nan(a, status)
5535          || floatx80_is_signaling_nan(b, status)) {
5536             float_raise(float_flag_invalid, status);
5537         }
5538         return 0;
5539     }
5540     return
5541            ( a.low == b.low )
5542         && (    ( a.high == b.high )
5543              || (    ( a.low == 0 )
5544                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5545            );
5546 
5547 }
5548 
5549 /*----------------------------------------------------------------------------
5550 | Returns 1 if the extended double-precision floating-point value `a' is less
5551 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
5552 | do not cause an exception.  Otherwise, the comparison is performed according
5553 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5554 *----------------------------------------------------------------------------*/
5555 
5556 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
5557 {
5558     flag aSign, bSign;
5559 
5560     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5561         float_raise(float_flag_invalid, status);
5562         return 0;
5563     }
5564     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5565               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5566          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5567               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5568        ) {
5569         if (floatx80_is_signaling_nan(a, status)
5570          || floatx80_is_signaling_nan(b, status)) {
5571             float_raise(float_flag_invalid, status);
5572         }
5573         return 0;
5574     }
5575     aSign = extractFloatx80Sign( a );
5576     bSign = extractFloatx80Sign( b );
5577     if ( aSign != bSign ) {
5578         return
5579                aSign
5580             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5581                  == 0 );
5582     }
5583     return
5584           aSign ? le128( b.high, b.low, a.high, a.low )
5585         : le128( a.high, a.low, b.high, b.low );
5586 
5587 }
5588 
5589 /*----------------------------------------------------------------------------
5590 | Returns 1 if the extended double-precision floating-point value `a' is less
5591 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
5592 | an exception.  Otherwise, the comparison is performed according to the
5593 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5594 *----------------------------------------------------------------------------*/
5595 
5596 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
5597 {
5598     flag aSign, bSign;
5599 
5600     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5601         float_raise(float_flag_invalid, status);
5602         return 0;
5603     }
5604     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5605               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5606          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5607               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5608        ) {
5609         if (floatx80_is_signaling_nan(a, status)
5610          || floatx80_is_signaling_nan(b, status)) {
5611             float_raise(float_flag_invalid, status);
5612         }
5613         return 0;
5614     }
5615     aSign = extractFloatx80Sign( a );
5616     bSign = extractFloatx80Sign( b );
5617     if ( aSign != bSign ) {
5618         return
5619                aSign
5620             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5621                  != 0 );
5622     }
5623     return
5624           aSign ? lt128( b.high, b.low, a.high, a.low )
5625         : lt128( a.high, a.low, b.high, b.low );
5626 
5627 }
5628 
5629 /*----------------------------------------------------------------------------
5630 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5631 | cannot be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.
5632 | The comparison is performed according to the IEC/IEEE Standard for Binary
5633 | Floating-Point Arithmetic.
5634 *----------------------------------------------------------------------------*/
5635 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
5636 {
5637     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5638         float_raise(float_flag_invalid, status);
5639         return 1;
5640     }
5641     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5642               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5643          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5644               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5645        ) {
5646         if (floatx80_is_signaling_nan(a, status)
5647          || floatx80_is_signaling_nan(b, status)) {
5648             float_raise(float_flag_invalid, status);
5649         }
5650         return 1;
5651     }
5652     return 0;
5653 }
5654 
5655 /*----------------------------------------------------------------------------
5656 | Returns the result of converting the quadruple-precision floating-point
5657 | value `a' to the 32-bit two's complement integer format.  The conversion
5658 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5659 | Arithmetic---which means in particular that the conversion is rounded
5660 | according to the current rounding mode.  If `a' is a NaN, the largest
5661 | positive integer is returned.  Otherwise, if the conversion overflows, the
5662 | largest integer with the same sign as `a' is returned.
5663 *----------------------------------------------------------------------------*/
5664 
5665 int32_t float128_to_int32(float128 a, float_status *status)
5666 {
5667     flag aSign;
5668     int32_t aExp, shiftCount;
5669     uint64_t aSig0, aSig1;
5670 
5671     aSig1 = extractFloat128Frac1( a );
5672     aSig0 = extractFloat128Frac0( a );
5673     aExp = extractFloat128Exp( a );
5674     aSign = extractFloat128Sign( a );
5675     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5676     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5677     aSig0 |= ( aSig1 != 0 );
5678     shiftCount = 0x4028 - aExp;
5679     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5680     return roundAndPackInt32(aSign, aSig0, status);
5681 
5682 }
5683 
5684 /*----------------------------------------------------------------------------
5685 | Returns the result of converting the quadruple-precision floating-point
5686 | value `a' to the 32-bit two's complement integer format.  The conversion
5687 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5688 | Arithmetic, except that the conversion is always rounded toward zero.  If
5689 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
5690 | conversion overflows, the largest integer with the same sign as `a' is
5691 | returned.
5692 *----------------------------------------------------------------------------*/
5693 
5694 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
5695 {
5696     flag aSign;
5697     int32_t aExp, shiftCount;
5698     uint64_t aSig0, aSig1, savedASig;
5699     int32_t z;
5700 
5701     aSig1 = extractFloat128Frac1( a );
5702     aSig0 = extractFloat128Frac0( a );
5703     aExp = extractFloat128Exp( a );
5704     aSign = extractFloat128Sign( a );
5705     aSig0 |= ( aSig1 != 0 );
5706     if ( 0x401E < aExp ) {
5707         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5708         goto invalid;
5709     }
5710     else if ( aExp < 0x3FFF ) {
5711         if (aExp || aSig0) {
5712             status->float_exception_flags |= float_flag_inexact;
5713         }
5714         return 0;
5715     }
5716     aSig0 |= LIT64( 0x0001000000000000 );
5717     shiftCount = 0x402F - aExp;
5718     savedASig = aSig0;
5719     aSig0 >>= shiftCount;
5720     z = aSig0;
5721     if ( aSign ) z = - z;
5722     if ( ( z < 0 ) ^ aSign ) {
5723  invalid:
5724         float_raise(float_flag_invalid, status);
5725         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5726     }
5727     if ( ( aSig0<<shiftCount ) != savedASig ) {
5728         status->float_exception_flags |= float_flag_inexact;
5729     }
5730     return z;
5731 
5732 }
5733 
5734 /*----------------------------------------------------------------------------
5735 | Returns the result of converting the quadruple-precision floating-point
5736 | value `a' to the 64-bit two's complement integer format.  The conversion
5737 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5738 | Arithmetic---which means in particular that the conversion is rounded
5739 | according to the current rounding mode.  If `a' is a NaN, the largest
5740 | positive integer is returned.  Otherwise, if the conversion overflows, the
5741 | largest integer with the same sign as `a' is returned.
5742 *----------------------------------------------------------------------------*/
5743 
5744 int64_t float128_to_int64(float128 a, float_status *status)
5745 {
5746     flag aSign;
5747     int32_t aExp, shiftCount;
5748     uint64_t aSig0, aSig1;
5749 
5750     aSig1 = extractFloat128Frac1( a );
5751     aSig0 = extractFloat128Frac0( a );
5752     aExp = extractFloat128Exp( a );
5753     aSign = extractFloat128Sign( a );
5754     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5755     shiftCount = 0x402F - aExp;
5756     if ( shiftCount <= 0 ) {
5757         if ( 0x403E < aExp ) {
5758             float_raise(float_flag_invalid, status);
5759             if (    ! aSign
5760                  || (    ( aExp == 0x7FFF )
5761                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5762                     )
5763                ) {
5764                 return LIT64( 0x7FFFFFFFFFFFFFFF );
5765             }
5766             return (int64_t) LIT64( 0x8000000000000000 );
5767         }
5768         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5769     }
5770     else {
5771         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5772     }
5773     return roundAndPackInt64(aSign, aSig0, aSig1, status);
5774 
5775 }
5776 
5777 /*----------------------------------------------------------------------------
5778 | Returns the result of converting the quadruple-precision floating-point
5779 | value `a' to the 64-bit two's complement integer format.  The conversion
5780 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5781 | Arithmetic, except that the conversion is always rounded toward zero.
5782 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
5783 | the conversion overflows, the largest integer with the same sign as `a' is
5784 | returned.
5785 *----------------------------------------------------------------------------*/
5786 
5787 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
5788 {
5789     flag aSign;
5790     int32_t aExp, shiftCount;
5791     uint64_t aSig0, aSig1;
5792     int64_t z;
5793 
5794     aSig1 = extractFloat128Frac1( a );
5795     aSig0 = extractFloat128Frac0( a );
5796     aExp = extractFloat128Exp( a );
5797     aSign = extractFloat128Sign( a );
5798     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5799     shiftCount = aExp - 0x402F;
5800     if ( 0 < shiftCount ) {
5801         if ( 0x403E <= aExp ) {
5802             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5803             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
5804                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
5805                 if (aSig1) {
5806                     status->float_exception_flags |= float_flag_inexact;
5807                 }
5808             }
5809             else {
5810                 float_raise(float_flag_invalid, status);
5811                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
5812                     return LIT64( 0x7FFFFFFFFFFFFFFF );
5813                 }
5814             }
5815             return (int64_t) LIT64( 0x8000000000000000 );
5816         }
5817         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
5818         if ( (uint64_t) ( aSig1<<shiftCount ) ) {
5819             status->float_exception_flags |= float_flag_inexact;
5820         }
5821     }
5822     else {
5823         if ( aExp < 0x3FFF ) {
5824             if ( aExp | aSig0 | aSig1 ) {
5825                 status->float_exception_flags |= float_flag_inexact;
5826             }
5827             return 0;
5828         }
5829         z = aSig0>>( - shiftCount );
5830         if (    aSig1
5831              || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
5832             status->float_exception_flags |= float_flag_inexact;
5833         }
5834     }
5835     if ( aSign ) z = - z;
5836     return z;
5837 
5838 }
5839 
5840 /*----------------------------------------------------------------------------
5841 | Returns the result of converting the quadruple-precision floating-point value
5842 | `a' to the 64-bit unsigned integer format.  The conversion is
5843 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5844 | Arithmetic---which means in particular that the conversion is rounded
5845 | according to the current rounding mode.  If `a' is a NaN, the largest
5846 | positive integer is returned.  If the conversion overflows, the
5847 | largest unsigned integer is returned.  If 'a' is negative, the value is
5848 | rounded and zero is returned; negative values that do not round to zero
5849 | will raise the inexact exception.
5850 *----------------------------------------------------------------------------*/
5851 
5852 uint64_t float128_to_uint64(float128 a, float_status *status)
5853 {
5854     flag aSign;
5855     int aExp;
5856     int shiftCount;
5857     uint64_t aSig0, aSig1;
5858 
5859     aSig0 = extractFloat128Frac0(a);
5860     aSig1 = extractFloat128Frac1(a);
5861     aExp = extractFloat128Exp(a);
5862     aSign = extractFloat128Sign(a);
5863     if (aSign && (aExp > 0x3FFE)) {
5864         float_raise(float_flag_invalid, status);
5865         if (float128_is_any_nan(a)) {
5866             return LIT64(0xFFFFFFFFFFFFFFFF);
5867         } else {
5868             return 0;
5869         }
5870     }
5871     if (aExp) {
5872         aSig0 |= LIT64(0x0001000000000000);
5873     }
5874     shiftCount = 0x402F - aExp;
5875     if (shiftCount <= 0) {
5876         if (0x403E < aExp) {
5877             float_raise(float_flag_invalid, status);
5878             return LIT64(0xFFFFFFFFFFFFFFFF);
5879         }
5880         shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
5881     } else {
5882         shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
5883     }
5884     return roundAndPackUint64(aSign, aSig0, aSig1, status);
5885 }
5886 
5887 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
5888 {
5889     uint64_t v;
5890     signed char current_rounding_mode = status->float_rounding_mode;
5891 
5892     set_float_rounding_mode(float_round_to_zero, status);
5893     v = float128_to_uint64(a, status);
5894     set_float_rounding_mode(current_rounding_mode, status);
5895 
5896     return v;
5897 }
5898 
5899 /*----------------------------------------------------------------------------
5900 | Returns the result of converting the quadruple-precision floating-point
5901 | value `a' to the 32-bit unsigned integer format.  The conversion
5902 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5903 | Arithmetic except that the conversion is always rounded toward zero.
5904 | If `a' is a NaN, the largest positive integer is returned.  Otherwise,
5905 | if the conversion overflows, the largest unsigned integer is returned.
5906 | If 'a' is negative, the value is rounded and zero is returned; negative
5907 | values that do not round to zero will raise the inexact exception.
5908 *----------------------------------------------------------------------------*/
5909 
5910 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
5911 {
5912     uint64_t v;
5913     uint32_t res;
5914     int old_exc_flags = get_float_exception_flags(status);
5915 
5916     v = float128_to_uint64_round_to_zero(a, status);
5917     if (v > 0xffffffff) {
5918         res = 0xffffffff;
5919     } else {
5920         return v;
5921     }
5922     set_float_exception_flags(old_exc_flags, status);
5923     float_raise(float_flag_invalid, status);
5924     return res;
5925 }
5926 
5927 /*----------------------------------------------------------------------------
5928 | Returns the result of converting the quadruple-precision floating-point
5929 | value `a' to the single-precision floating-point format.  The conversion
5930 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5931 | Arithmetic.
5932 *----------------------------------------------------------------------------*/
5933 
5934 float32 float128_to_float32(float128 a, float_status *status)
5935 {
5936     flag aSign;
5937     int32_t aExp;
5938     uint64_t aSig0, aSig1;
5939     uint32_t zSig;
5940 
5941     aSig1 = extractFloat128Frac1( a );
5942     aSig0 = extractFloat128Frac0( a );
5943     aExp = extractFloat128Exp( a );
5944     aSign = extractFloat128Sign( a );
5945     if ( aExp == 0x7FFF ) {
5946         if ( aSig0 | aSig1 ) {
5947             return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
5948         }
5949         return packFloat32( aSign, 0xFF, 0 );
5950     }
5951     aSig0 |= ( aSig1 != 0 );
5952     shift64RightJamming( aSig0, 18, &aSig0 );
5953     zSig = aSig0;
5954     if ( aExp || zSig ) {
5955         zSig |= 0x40000000;
5956         aExp -= 0x3F81;
5957     }
5958     return roundAndPackFloat32(aSign, aExp, zSig, status);
5959 
5960 }
5961 
5962 /*----------------------------------------------------------------------------
5963 | Returns the result of converting the quadruple-precision floating-point
5964 | value `a' to the double-precision floating-point format.  The conversion
5965 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5966 | Arithmetic.
5967 *----------------------------------------------------------------------------*/
5968 
5969 float64 float128_to_float64(float128 a, float_status *status)
5970 {
5971     flag aSign;
5972     int32_t aExp;
5973     uint64_t aSig0, aSig1;
5974 
5975     aSig1 = extractFloat128Frac1( a );
5976     aSig0 = extractFloat128Frac0( a );
5977     aExp = extractFloat128Exp( a );
5978     aSign = extractFloat128Sign( a );
5979     if ( aExp == 0x7FFF ) {
5980         if ( aSig0 | aSig1 ) {
5981             return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
5982         }
5983         return packFloat64( aSign, 0x7FF, 0 );
5984     }
5985     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5986     aSig0 |= ( aSig1 != 0 );
5987     if ( aExp || aSig0 ) {
5988         aSig0 |= LIT64( 0x4000000000000000 );
5989         aExp -= 0x3C01;
5990     }
5991     return roundAndPackFloat64(aSign, aExp, aSig0, status);
5992 
5993 }
5994 
5995 /*----------------------------------------------------------------------------
5996 | Returns the result of converting the quadruple-precision floating-point
5997 | value `a' to the extended double-precision floating-point format.  The
5998 | conversion is performed according to the IEC/IEEE Standard for Binary
5999 | Floating-Point Arithmetic.
6000 *----------------------------------------------------------------------------*/
6001 
6002 floatx80 float128_to_floatx80(float128 a, float_status *status)
6003 {
6004     flag aSign;
6005     int32_t aExp;
6006     uint64_t aSig0, aSig1;
6007 
6008     aSig1 = extractFloat128Frac1( a );
6009     aSig0 = extractFloat128Frac0( a );
6010     aExp = extractFloat128Exp( a );
6011     aSign = extractFloat128Sign( a );
6012     if ( aExp == 0x7FFF ) {
6013         if ( aSig0 | aSig1 ) {
6014             return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
6015         }
6016         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
6017     }
6018     if ( aExp == 0 ) {
6019         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
6020         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6021     }
6022     else {
6023         aSig0 |= LIT64( 0x0001000000000000 );
6024     }
6025     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
6026     return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
6027 
6028 }
6029 
6030 /*----------------------------------------------------------------------------
6031 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6032 | returns the result as a quadruple-precision floating-point value.  The
6033 | operation is performed according to the IEC/IEEE Standard for Binary
6034 | Floating-Point Arithmetic.
6035 *----------------------------------------------------------------------------*/
6036 
6037 float128 float128_round_to_int(float128 a, float_status *status)
6038 {
6039     flag aSign;
6040     int32_t aExp;
6041     uint64_t lastBitMask, roundBitsMask;
6042     float128 z;
6043 
6044     aExp = extractFloat128Exp( a );
6045     if ( 0x402F <= aExp ) {
6046         if ( 0x406F <= aExp ) {
6047             if (    ( aExp == 0x7FFF )
6048                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
6049                ) {
6050                 return propagateFloat128NaN(a, a, status);
6051             }
6052             return a;
6053         }
6054         lastBitMask = 1;
6055         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
6056         roundBitsMask = lastBitMask - 1;
6057         z = a;
6058         switch (status->float_rounding_mode) {
6059         case float_round_nearest_even:
6060             if ( lastBitMask ) {
6061                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
6062                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
6063             }
6064             else {
6065                 if ( (int64_t) z.low < 0 ) {
6066                     ++z.high;
6067                     if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
6068                 }
6069             }
6070             break;
6071         case float_round_ties_away:
6072             if (lastBitMask) {
6073                 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
6074             } else {
6075                 if ((int64_t) z.low < 0) {
6076                     ++z.high;
6077                 }
6078             }
6079             break;
6080         case float_round_to_zero:
6081             break;
6082         case float_round_up:
6083             if (!extractFloat128Sign(z)) {
6084                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6085             }
6086             break;
6087         case float_round_down:
6088             if (extractFloat128Sign(z)) {
6089                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6090             }
6091             break;
6092         default:
6093             abort();
6094         }
6095         z.low &= ~ roundBitsMask;
6096     }
6097     else {
6098         if ( aExp < 0x3FFF ) {
6099             if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
6100             status->float_exception_flags |= float_flag_inexact;
6101             aSign = extractFloat128Sign( a );
6102             switch (status->float_rounding_mode) {
6103              case float_round_nearest_even:
6104                 if (    ( aExp == 0x3FFE )
6105                      && (   extractFloat128Frac0( a )
6106                           | extractFloat128Frac1( a ) )
6107                    ) {
6108                     return packFloat128( aSign, 0x3FFF, 0, 0 );
6109                 }
6110                 break;
6111             case float_round_ties_away:
6112                 if (aExp == 0x3FFE) {
6113                     return packFloat128(aSign, 0x3FFF, 0, 0);
6114                 }
6115                 break;
6116              case float_round_down:
6117                 return
6118                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
6119                     : packFloat128( 0, 0, 0, 0 );
6120              case float_round_up:
6121                 return
6122                       aSign ? packFloat128( 1, 0, 0, 0 )
6123                     : packFloat128( 0, 0x3FFF, 0, 0 );
6124             }
6125             return packFloat128( aSign, 0, 0, 0 );
6126         }
6127         lastBitMask = 1;
6128         lastBitMask <<= 0x402F - aExp;
6129         roundBitsMask = lastBitMask - 1;
6130         z.low = 0;
6131         z.high = a.high;
6132         switch (status->float_rounding_mode) {
6133         case float_round_nearest_even:
6134             z.high += lastBitMask>>1;
6135             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
6136                 z.high &= ~ lastBitMask;
6137             }
6138             break;
6139         case float_round_ties_away:
6140             z.high += lastBitMask>>1;
6141             break;
6142         case float_round_to_zero:
6143             break;
6144         case float_round_up:
6145             if (!extractFloat128Sign(z)) {
6146                 z.high |= ( a.low != 0 );
6147                 z.high += roundBitsMask;
6148             }
6149             break;
6150         case float_round_down:
6151             if (extractFloat128Sign(z)) {
6152                 z.high |= (a.low != 0);
6153                 z.high += roundBitsMask;
6154             }
6155             break;
6156         default:
6157             abort();
6158         }
6159         z.high &= ~ roundBitsMask;
6160     }
6161     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
6162         status->float_exception_flags |= float_flag_inexact;
6163     }
6164     return z;
6165 
6166 }
6167 
6168 /*----------------------------------------------------------------------------
6169 | Returns the result of adding the absolute values of the quadruple-precision
6170 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
6171 | before being returned.  `zSign' is ignored if the result is a NaN.
6172 | The addition is performed according to the IEC/IEEE Standard for Binary
6173 | Floating-Point Arithmetic.
6174 *----------------------------------------------------------------------------*/
6175 
6176 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
6177                                 float_status *status)
6178 {
6179     int32_t aExp, bExp, zExp;
6180     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6181     int32_t expDiff;
6182 
6183     aSig1 = extractFloat128Frac1( a );
6184     aSig0 = extractFloat128Frac0( a );
6185     aExp = extractFloat128Exp( a );
6186     bSig1 = extractFloat128Frac1( b );
6187     bSig0 = extractFloat128Frac0( b );
6188     bExp = extractFloat128Exp( b );
6189     expDiff = aExp - bExp;
6190     if ( 0 < expDiff ) {
6191         if ( aExp == 0x7FFF ) {
6192             if (aSig0 | aSig1) {
6193                 return propagateFloat128NaN(a, b, status);
6194             }
6195             return a;
6196         }
6197         if ( bExp == 0 ) {
6198             --expDiff;
6199         }
6200         else {
6201             bSig0 |= LIT64( 0x0001000000000000 );
6202         }
6203         shift128ExtraRightJamming(
6204             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
6205         zExp = aExp;
6206     }
6207     else if ( expDiff < 0 ) {
6208         if ( bExp == 0x7FFF ) {
6209             if (bSig0 | bSig1) {
6210                 return propagateFloat128NaN(a, b, status);
6211             }
6212             return packFloat128( zSign, 0x7FFF, 0, 0 );
6213         }
6214         if ( aExp == 0 ) {
6215             ++expDiff;
6216         }
6217         else {
6218             aSig0 |= LIT64( 0x0001000000000000 );
6219         }
6220         shift128ExtraRightJamming(
6221             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
6222         zExp = bExp;
6223     }
6224     else {
6225         if ( aExp == 0x7FFF ) {
6226             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6227                 return propagateFloat128NaN(a, b, status);
6228             }
6229             return a;
6230         }
6231         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6232         if ( aExp == 0 ) {
6233             if (status->flush_to_zero) {
6234                 if (zSig0 | zSig1) {
6235                     float_raise(float_flag_output_denormal, status);
6236                 }
6237                 return packFloat128(zSign, 0, 0, 0);
6238             }
6239             return packFloat128( zSign, 0, zSig0, zSig1 );
6240         }
6241         zSig2 = 0;
6242         zSig0 |= LIT64( 0x0002000000000000 );
6243         zExp = aExp;
6244         goto shiftRight1;
6245     }
6246     aSig0 |= LIT64( 0x0001000000000000 );
6247     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6248     --zExp;
6249     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
6250     ++zExp;
6251  shiftRight1:
6252     shift128ExtraRightJamming(
6253         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6254  roundAndPack:
6255     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6256 
6257 }
6258 
6259 /*----------------------------------------------------------------------------
6260 | Returns the result of subtracting the absolute values of the quadruple-
6261 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
6262 | difference is negated before being returned.  `zSign' is ignored if the
6263 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
6264 | Standard for Binary Floating-Point Arithmetic.
6265 *----------------------------------------------------------------------------*/
6266 
6267 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
6268                                 float_status *status)
6269 {
6270     int32_t aExp, bExp, zExp;
6271     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
6272     int32_t expDiff;
6273 
6274     aSig1 = extractFloat128Frac1( a );
6275     aSig0 = extractFloat128Frac0( a );
6276     aExp = extractFloat128Exp( a );
6277     bSig1 = extractFloat128Frac1( b );
6278     bSig0 = extractFloat128Frac0( b );
6279     bExp = extractFloat128Exp( b );
6280     expDiff = aExp - bExp;
6281     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6282     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6283     if ( 0 < expDiff ) goto aExpBigger;
6284     if ( expDiff < 0 ) goto bExpBigger;
6285     if ( aExp == 0x7FFF ) {
6286         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6287             return propagateFloat128NaN(a, b, status);
6288         }
6289         float_raise(float_flag_invalid, status);
6290         return float128_default_nan(status);
6291     }
6292     if ( aExp == 0 ) {
6293         aExp = 1;
6294         bExp = 1;
6295     }
6296     if ( bSig0 < aSig0 ) goto aBigger;
6297     if ( aSig0 < bSig0 ) goto bBigger;
6298     if ( bSig1 < aSig1 ) goto aBigger;
6299     if ( aSig1 < bSig1 ) goto bBigger;
6300     return packFloat128(status->float_rounding_mode == float_round_down,
6301                         0, 0, 0);
6302  bExpBigger:
6303     if ( bExp == 0x7FFF ) {
6304         if (bSig0 | bSig1) {
6305             return propagateFloat128NaN(a, b, status);
6306         }
6307         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6308     }
6309     if ( aExp == 0 ) {
6310         ++expDiff;
6311     }
6312     else {
6313         aSig0 |= LIT64( 0x4000000000000000 );
6314     }
6315     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6316     bSig0 |= LIT64( 0x4000000000000000 );
6317  bBigger:
6318     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6319     zExp = bExp;
6320     zSign ^= 1;
6321     goto normalizeRoundAndPack;
6322  aExpBigger:
6323     if ( aExp == 0x7FFF ) {
6324         if (aSig0 | aSig1) {
6325             return propagateFloat128NaN(a, b, status);
6326         }
6327         return a;
6328     }
6329     if ( bExp == 0 ) {
6330         --expDiff;
6331     }
6332     else {
6333         bSig0 |= LIT64( 0x4000000000000000 );
6334     }
6335     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6336     aSig0 |= LIT64( 0x4000000000000000 );
6337  aBigger:
6338     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6339     zExp = aExp;
6340  normalizeRoundAndPack:
6341     --zExp;
6342     return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
6343                                          status);
6344 
6345 }
6346 
6347 /*----------------------------------------------------------------------------
6348 | Returns the result of adding the quadruple-precision floating-point values
6349 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
6350 | for Binary Floating-Point Arithmetic.
6351 *----------------------------------------------------------------------------*/
6352 
6353 float128 float128_add(float128 a, float128 b, float_status *status)
6354 {
6355     flag aSign, bSign;
6356 
6357     aSign = extractFloat128Sign( a );
6358     bSign = extractFloat128Sign( b );
6359     if ( aSign == bSign ) {
6360         return addFloat128Sigs(a, b, aSign, status);
6361     }
6362     else {
6363         return subFloat128Sigs(a, b, aSign, status);
6364     }
6365 
6366 }
6367 
6368 /*----------------------------------------------------------------------------
6369 | Returns the result of subtracting the quadruple-precision floating-point
6370 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6371 | Standard for Binary Floating-Point Arithmetic.
6372 *----------------------------------------------------------------------------*/
6373 
6374 float128 float128_sub(float128 a, float128 b, float_status *status)
6375 {
6376     flag aSign, bSign;
6377 
6378     aSign = extractFloat128Sign( a );
6379     bSign = extractFloat128Sign( b );
6380     if ( aSign == bSign ) {
6381         return subFloat128Sigs(a, b, aSign, status);
6382     }
6383     else {
6384         return addFloat128Sigs(a, b, aSign, status);
6385     }
6386 
6387 }
6388 
6389 /*----------------------------------------------------------------------------
6390 | Returns the result of multiplying the quadruple-precision floating-point
6391 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6392 | Standard for Binary Floating-Point Arithmetic.
6393 *----------------------------------------------------------------------------*/
6394 
6395 float128 float128_mul(float128 a, float128 b, float_status *status)
6396 {
6397     flag aSign, bSign, zSign;
6398     int32_t aExp, bExp, zExp;
6399     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
6400 
6401     aSig1 = extractFloat128Frac1( a );
6402     aSig0 = extractFloat128Frac0( a );
6403     aExp = extractFloat128Exp( a );
6404     aSign = extractFloat128Sign( a );
6405     bSig1 = extractFloat128Frac1( b );
6406     bSig0 = extractFloat128Frac0( b );
6407     bExp = extractFloat128Exp( b );
6408     bSign = extractFloat128Sign( b );
6409     zSign = aSign ^ bSign;
6410     if ( aExp == 0x7FFF ) {
6411         if (    ( aSig0 | aSig1 )
6412              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6413             return propagateFloat128NaN(a, b, status);
6414         }
6415         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6416         return packFloat128( zSign, 0x7FFF, 0, 0 );
6417     }
6418     if ( bExp == 0x7FFF ) {
6419         if (bSig0 | bSig1) {
6420             return propagateFloat128NaN(a, b, status);
6421         }
6422         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6423  invalid:
6424             float_raise(float_flag_invalid, status);
6425             return float128_default_nan(status);
6426         }
6427         return packFloat128( zSign, 0x7FFF, 0, 0 );
6428     }
6429     if ( aExp == 0 ) {
6430         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6431         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6432     }
6433     if ( bExp == 0 ) {
6434         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6435         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6436     }
6437     zExp = aExp + bExp - 0x4000;
6438     aSig0 |= LIT64( 0x0001000000000000 );
6439     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6440     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6441     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6442     zSig2 |= ( zSig3 != 0 );
6443     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6444         shift128ExtraRightJamming(
6445             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6446         ++zExp;
6447     }
6448     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6449 
6450 }
6451 
6452 /*----------------------------------------------------------------------------
6453 | Returns the result of dividing the quadruple-precision floating-point value
6454 | `a' by the corresponding value `b'.  The operation is performed according to
6455 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6456 *----------------------------------------------------------------------------*/
6457 
6458 float128 float128_div(float128 a, float128 b, float_status *status)
6459 {
6460     flag aSign, bSign, zSign;
6461     int32_t aExp, bExp, zExp;
6462     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6463     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6464 
6465     aSig1 = extractFloat128Frac1( a );
6466     aSig0 = extractFloat128Frac0( a );
6467     aExp = extractFloat128Exp( a );
6468     aSign = extractFloat128Sign( a );
6469     bSig1 = extractFloat128Frac1( b );
6470     bSig0 = extractFloat128Frac0( b );
6471     bExp = extractFloat128Exp( b );
6472     bSign = extractFloat128Sign( b );
6473     zSign = aSign ^ bSign;
6474     if ( aExp == 0x7FFF ) {
6475         if (aSig0 | aSig1) {
6476             return propagateFloat128NaN(a, b, status);
6477         }
6478         if ( bExp == 0x7FFF ) {
6479             if (bSig0 | bSig1) {
6480                 return propagateFloat128NaN(a, b, status);
6481             }
6482             goto invalid;
6483         }
6484         return packFloat128( zSign, 0x7FFF, 0, 0 );
6485     }
6486     if ( bExp == 0x7FFF ) {
6487         if (bSig0 | bSig1) {
6488             return propagateFloat128NaN(a, b, status);
6489         }
6490         return packFloat128( zSign, 0, 0, 0 );
6491     }
6492     if ( bExp == 0 ) {
6493         if ( ( bSig0 | bSig1 ) == 0 ) {
6494             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6495  invalid:
6496                 float_raise(float_flag_invalid, status);
6497                 return float128_default_nan(status);
6498             }
6499             float_raise(float_flag_divbyzero, status);
6500             return packFloat128( zSign, 0x7FFF, 0, 0 );
6501         }
6502         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6503     }
6504     if ( aExp == 0 ) {
6505         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6506         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6507     }
6508     zExp = aExp - bExp + 0x3FFD;
6509     shortShift128Left(
6510         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6511     shortShift128Left(
6512         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6513     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6514         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6515         ++zExp;
6516     }
6517     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6518     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6519     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
6520     while ( (int64_t) rem0 < 0 ) {
6521         --zSig0;
6522         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6523     }
6524     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6525     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6526         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6527         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
6528         while ( (int64_t) rem1 < 0 ) {
6529             --zSig1;
6530             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6531         }
6532         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6533     }
6534     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
6535     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6536 
6537 }
6538 
6539 /*----------------------------------------------------------------------------
6540 | Returns the remainder of the quadruple-precision floating-point value `a'
6541 | with respect to the corresponding value `b'.  The operation is performed
6542 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6543 *----------------------------------------------------------------------------*/
6544 
6545 float128 float128_rem(float128 a, float128 b, float_status *status)
6546 {
6547     flag aSign, zSign;
6548     int32_t aExp, bExp, expDiff;
6549     uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6550     uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6551     int64_t sigMean0;
6552 
6553     aSig1 = extractFloat128Frac1( a );
6554     aSig0 = extractFloat128Frac0( a );
6555     aExp = extractFloat128Exp( a );
6556     aSign = extractFloat128Sign( a );
6557     bSig1 = extractFloat128Frac1( b );
6558     bSig0 = extractFloat128Frac0( b );
6559     bExp = extractFloat128Exp( b );
6560     if ( aExp == 0x7FFF ) {
6561         if (    ( aSig0 | aSig1 )
6562              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6563             return propagateFloat128NaN(a, b, status);
6564         }
6565         goto invalid;
6566     }
6567     if ( bExp == 0x7FFF ) {
6568         if (bSig0 | bSig1) {
6569             return propagateFloat128NaN(a, b, status);
6570         }
6571         return a;
6572     }
6573     if ( bExp == 0 ) {
6574         if ( ( bSig0 | bSig1 ) == 0 ) {
6575  invalid:
6576             float_raise(float_flag_invalid, status);
6577             return float128_default_nan(status);
6578         }
6579         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6580     }
6581     if ( aExp == 0 ) {
6582         if ( ( aSig0 | aSig1 ) == 0 ) return a;
6583         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6584     }
6585     expDiff = aExp - bExp;
6586     if ( expDiff < -1 ) return a;
6587     shortShift128Left(
6588         aSig0 | LIT64( 0x0001000000000000 ),
6589         aSig1,
6590         15 - ( expDiff < 0 ),
6591         &aSig0,
6592         &aSig1
6593     );
6594     shortShift128Left(
6595         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6596     q = le128( bSig0, bSig1, aSig0, aSig1 );
6597     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6598     expDiff -= 64;
6599     while ( 0 < expDiff ) {
6600         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6601         q = ( 4 < q ) ? q - 4 : 0;
6602         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6603         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6604         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6605         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6606         expDiff -= 61;
6607     }
6608     if ( -64 < expDiff ) {
6609         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6610         q = ( 4 < q ) ? q - 4 : 0;
6611         q >>= - expDiff;
6612         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6613         expDiff += 52;
6614         if ( expDiff < 0 ) {
6615             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6616         }
6617         else {
6618             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6619         }
6620         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6621         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6622     }
6623     else {
6624         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6625         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6626     }
6627     do {
6628         alternateASig0 = aSig0;
6629         alternateASig1 = aSig1;
6630         ++q;
6631         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6632     } while ( 0 <= (int64_t) aSig0 );
6633     add128(
6634         aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
6635     if (    ( sigMean0 < 0 )
6636          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6637         aSig0 = alternateASig0;
6638         aSig1 = alternateASig1;
6639     }
6640     zSign = ( (int64_t) aSig0 < 0 );
6641     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6642     return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
6643                                          status);
6644 }
6645 
6646 /*----------------------------------------------------------------------------
6647 | Returns the square root of the quadruple-precision floating-point value `a'.
6648 | The operation is performed according to the IEC/IEEE Standard for Binary
6649 | Floating-Point Arithmetic.
6650 *----------------------------------------------------------------------------*/
6651 
6652 float128 float128_sqrt(float128 a, float_status *status)
6653 {
6654     flag aSign;
6655     int32_t aExp, zExp;
6656     uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6657     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6658 
6659     aSig1 = extractFloat128Frac1( a );
6660     aSig0 = extractFloat128Frac0( a );
6661     aExp = extractFloat128Exp( a );
6662     aSign = extractFloat128Sign( a );
6663     if ( aExp == 0x7FFF ) {
6664         if (aSig0 | aSig1) {
6665             return propagateFloat128NaN(a, a, status);
6666         }
6667         if ( ! aSign ) return a;
6668         goto invalid;
6669     }
6670     if ( aSign ) {
6671         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6672  invalid:
6673         float_raise(float_flag_invalid, status);
6674         return float128_default_nan(status);
6675     }
6676     if ( aExp == 0 ) {
6677         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6678         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6679     }
6680     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6681     aSig0 |= LIT64( 0x0001000000000000 );
6682     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6683     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6684     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6685     doubleZSig0 = zSig0<<1;
6686     mul64To128( zSig0, zSig0, &term0, &term1 );
6687     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6688     while ( (int64_t) rem0 < 0 ) {
6689         --zSig0;
6690         doubleZSig0 -= 2;
6691         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6692     }
6693     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6694     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6695         if ( zSig1 == 0 ) zSig1 = 1;
6696         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6697         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6698         mul64To128( zSig1, zSig1, &term2, &term3 );
6699         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6700         while ( (int64_t) rem1 < 0 ) {
6701             --zSig1;
6702             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6703             term3 |= 1;
6704             term2 |= doubleZSig0;
6705             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6706         }
6707         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6708     }
6709     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6710     return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
6711 
6712 }
6713 
6714 /*----------------------------------------------------------------------------
6715 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6716 | the corresponding value `b', and 0 otherwise.  The invalid exception is
6717 | raised if either operand is a NaN.  Otherwise, the comparison is performed
6718 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6719 *----------------------------------------------------------------------------*/
6720 
6721 int float128_eq(float128 a, float128 b, float_status *status)
6722 {
6723 
6724     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6725               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6726          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6727               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6728        ) {
6729         float_raise(float_flag_invalid, status);
6730         return 0;
6731     }
6732     return
6733            ( a.low == b.low )
6734         && (    ( a.high == b.high )
6735              || (    ( a.low == 0 )
6736                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6737            );
6738 
6739 }
6740 
6741 /*----------------------------------------------------------------------------
6742 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6743 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
6744 | exception is raised if either operand is a NaN.  The comparison is performed
6745 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6746 *----------------------------------------------------------------------------*/
6747 
6748 int float128_le(float128 a, float128 b, float_status *status)
6749 {
6750     flag aSign, bSign;
6751 
6752     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6753               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6754          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6755               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6756        ) {
6757         float_raise(float_flag_invalid, status);
6758         return 0;
6759     }
6760     aSign = extractFloat128Sign( a );
6761     bSign = extractFloat128Sign( b );
6762     if ( aSign != bSign ) {
6763         return
6764                aSign
6765             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6766                  == 0 );
6767     }
6768     return
6769           aSign ? le128( b.high, b.low, a.high, a.low )
6770         : le128( a.high, a.low, b.high, b.low );
6771 
6772 }
6773 
6774 /*----------------------------------------------------------------------------
6775 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6776 | the corresponding value `b', and 0 otherwise.  The invalid exception is
6777 | raised if either operand is a NaN.  The comparison is performed according
6778 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6779 *----------------------------------------------------------------------------*/
6780 
6781 int float128_lt(float128 a, float128 b, float_status *status)
6782 {
6783     flag aSign, bSign;
6784 
6785     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6786               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6787          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6788               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6789        ) {
6790         float_raise(float_flag_invalid, status);
6791         return 0;
6792     }
6793     aSign = extractFloat128Sign( a );
6794     bSign = extractFloat128Sign( b );
6795     if ( aSign != bSign ) {
6796         return
6797                aSign
6798             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6799                  != 0 );
6800     }
6801     return
6802           aSign ? lt128( b.high, b.low, a.high, a.low )
6803         : lt128( a.high, a.low, b.high, b.low );
6804 
6805 }
6806 
6807 /*----------------------------------------------------------------------------
6808 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6809 | be compared, and 0 otherwise.  The invalid exception is raised if either
6810 | operand is a NaN. The comparison is performed according to the IEC/IEEE
6811 | Standard for Binary Floating-Point Arithmetic.
6812 *----------------------------------------------------------------------------*/
6813 
6814 int float128_unordered(float128 a, float128 b, float_status *status)
6815 {
6816     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6817               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6818          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6819               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6820        ) {
6821         float_raise(float_flag_invalid, status);
6822         return 1;
6823     }
6824     return 0;
6825 }
6826 
6827 /*----------------------------------------------------------------------------
6828 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6829 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
6830 | exception.  The comparison is performed according to the IEC/IEEE Standard
6831 | for Binary Floating-Point Arithmetic.
6832 *----------------------------------------------------------------------------*/
6833 
6834 int float128_eq_quiet(float128 a, float128 b, float_status *status)
6835 {
6836 
6837     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6838               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6839          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6840               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6841        ) {
6842         if (float128_is_signaling_nan(a, status)
6843          || float128_is_signaling_nan(b, status)) {
6844             float_raise(float_flag_invalid, status);
6845         }
6846         return 0;
6847     }
6848     return
6849            ( a.low == b.low )
6850         && (    ( a.high == b.high )
6851              || (    ( a.low == 0 )
6852                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6853            );
6854 
6855 }
6856 
6857 /*----------------------------------------------------------------------------
6858 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6859 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
6860 | cause an exception.  Otherwise, the comparison is performed according to the
6861 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6862 *----------------------------------------------------------------------------*/
6863 
6864 int float128_le_quiet(float128 a, float128 b, float_status *status)
6865 {
6866     flag aSign, bSign;
6867 
6868     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6869               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6870          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6871               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6872        ) {
6873         if (float128_is_signaling_nan(a, status)
6874          || float128_is_signaling_nan(b, status)) {
6875             float_raise(float_flag_invalid, status);
6876         }
6877         return 0;
6878     }
6879     aSign = extractFloat128Sign( a );
6880     bSign = extractFloat128Sign( b );
6881     if ( aSign != bSign ) {
6882         return
6883                aSign
6884             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6885                  == 0 );
6886     }
6887     return
6888           aSign ? le128( b.high, b.low, a.high, a.low )
6889         : le128( a.high, a.low, b.high, b.low );
6890 
6891 }
6892 
6893 /*----------------------------------------------------------------------------
6894 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6895 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
6896 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
6897 | Standard for Binary Floating-Point Arithmetic.
6898 *----------------------------------------------------------------------------*/
6899 
6900 int float128_lt_quiet(float128 a, float128 b, float_status *status)
6901 {
6902     flag aSign, bSign;
6903 
6904     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6905               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6906          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6907               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6908        ) {
6909         if (float128_is_signaling_nan(a, status)
6910          || float128_is_signaling_nan(b, status)) {
6911             float_raise(float_flag_invalid, status);
6912         }
6913         return 0;
6914     }
6915     aSign = extractFloat128Sign( a );
6916     bSign = extractFloat128Sign( b );
6917     if ( aSign != bSign ) {
6918         return
6919                aSign
6920             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6921                  != 0 );
6922     }
6923     return
6924           aSign ? lt128( b.high, b.low, a.high, a.low )
6925         : lt128( a.high, a.low, b.high, b.low );
6926 
6927 }
6928 
6929 /*----------------------------------------------------------------------------
6930 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6931 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
6932 | comparison is performed according to the IEC/IEEE Standard for Binary
6933 | Floating-Point Arithmetic.
6934 *----------------------------------------------------------------------------*/
6935 
6936 int float128_unordered_quiet(float128 a, float128 b, float_status *status)
6937 {
6938     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6939               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6940          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6941               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6942        ) {
6943         if (float128_is_signaling_nan(a, status)
6944          || float128_is_signaling_nan(b, status)) {
6945             float_raise(float_flag_invalid, status);
6946         }
6947         return 1;
6948     }
6949     return 0;
6950 }
6951 
6952 static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
6953                                             int is_quiet, float_status *status)
6954 {
6955     flag aSign, bSign;
6956 
6957     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6958         float_raise(float_flag_invalid, status);
6959         return float_relation_unordered;
6960     }
6961     if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
6962           ( extractFloatx80Frac( a )<<1 ) ) ||
6963         ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
6964           ( extractFloatx80Frac( b )<<1 ) )) {
6965         if (!is_quiet ||
6966             floatx80_is_signaling_nan(a, status) ||
6967             floatx80_is_signaling_nan(b, status)) {
6968             float_raise(float_flag_invalid, status);
6969         }
6970         return float_relation_unordered;
6971     }
6972     aSign = extractFloatx80Sign( a );
6973     bSign = extractFloatx80Sign( b );
6974     if ( aSign != bSign ) {
6975 
6976         if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
6977              ( ( a.low | b.low ) == 0 ) ) {
6978             /* zero case */
6979             return float_relation_equal;
6980         } else {
6981             return 1 - (2 * aSign);
6982         }
6983     } else {
6984         if (a.low == b.low && a.high == b.high) {
6985             return float_relation_equal;
6986         } else {
6987             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6988         }
6989     }
6990 }
6991 
6992 int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
6993 {
6994     return floatx80_compare_internal(a, b, 0, status);
6995 }
6996 
6997 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
6998 {
6999     return floatx80_compare_internal(a, b, 1, status);
7000 }
7001 
7002 static inline int float128_compare_internal(float128 a, float128 b,
7003                                             int is_quiet, float_status *status)
7004 {
7005     flag aSign, bSign;
7006 
7007     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7008           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7009         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7010           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7011         if (!is_quiet ||
7012             float128_is_signaling_nan(a, status) ||
7013             float128_is_signaling_nan(b, status)) {
7014             float_raise(float_flag_invalid, status);
7015         }
7016         return float_relation_unordered;
7017     }
7018     aSign = extractFloat128Sign( a );
7019     bSign = extractFloat128Sign( b );
7020     if ( aSign != bSign ) {
7021         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7022             /* zero case */
7023             return float_relation_equal;
7024         } else {
7025             return 1 - (2 * aSign);
7026         }
7027     } else {
7028         if (a.low == b.low && a.high == b.high) {
7029             return float_relation_equal;
7030         } else {
7031             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7032         }
7033     }
7034 }
7035 
7036 int float128_compare(float128 a, float128 b, float_status *status)
7037 {
7038     return float128_compare_internal(a, b, 0, status);
7039 }
7040 
7041 int float128_compare_quiet(float128 a, float128 b, float_status *status)
7042 {
7043     return float128_compare_internal(a, b, 1, status);
7044 }
7045 
7046 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
7047 {
7048     flag aSign;
7049     int32_t aExp;
7050     uint64_t aSig;
7051 
7052     if (floatx80_invalid_encoding(a)) {
7053         float_raise(float_flag_invalid, status);
7054         return floatx80_default_nan(status);
7055     }
7056     aSig = extractFloatx80Frac( a );
7057     aExp = extractFloatx80Exp( a );
7058     aSign = extractFloatx80Sign( a );
7059 
7060     if ( aExp == 0x7FFF ) {
7061         if ( aSig<<1 ) {
7062             return propagateFloatx80NaN(a, a, status);
7063         }
7064         return a;
7065     }
7066 
7067     if (aExp == 0) {
7068         if (aSig == 0) {
7069             return a;
7070         }
7071         aExp++;
7072     }
7073 
7074     if (n > 0x10000) {
7075         n = 0x10000;
7076     } else if (n < -0x10000) {
7077         n = -0x10000;
7078     }
7079 
7080     aExp += n;
7081     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7082                                          aSign, aExp, aSig, 0, status);
7083 }
7084 
7085 float128 float128_scalbn(float128 a, int n, float_status *status)
7086 {
7087     flag aSign;
7088     int32_t aExp;
7089     uint64_t aSig0, aSig1;
7090 
7091     aSig1 = extractFloat128Frac1( a );
7092     aSig0 = extractFloat128Frac0( a );
7093     aExp = extractFloat128Exp( a );
7094     aSign = extractFloat128Sign( a );
7095     if ( aExp == 0x7FFF ) {
7096         if ( aSig0 | aSig1 ) {
7097             return propagateFloat128NaN(a, a, status);
7098         }
7099         return a;
7100     }
7101     if (aExp != 0) {
7102         aSig0 |= LIT64( 0x0001000000000000 );
7103     } else if (aSig0 == 0 && aSig1 == 0) {
7104         return a;
7105     } else {
7106         aExp++;
7107     }
7108 
7109     if (n > 0x10000) {
7110         n = 0x10000;
7111     } else if (n < -0x10000) {
7112         n = -0x10000;
7113     }
7114 
7115     aExp += n - 1;
7116     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7117                                          , status);
7118 
7119 }
7120