xref: /openbmc/qemu/fpu/softfloat.c (revision a3fc8396352e945f9d14cac0237ebf9d91745969)
1 /*
2  * QEMU float support
3  *
4  * The code in this source file is derived from release 2a of the SoftFloat
5  * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6  * some later contributions) are provided under that license, as detailed below.
7  * It has subsequently been modified by contributors to the QEMU Project,
8  * so some portions are provided under:
9  *  the SoftFloat-2a license
10  *  the BSD license
11  *  GPL-v2-or-later
12  *
13  * Any future contributions to this file after December 1st 2014 will be
14  * taken to be licensed under the Softfloat-2a license unless specifically
15  * indicated otherwise.
16  */
17 
18 /*
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
22 
23 Written by John R. Hauser.  This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704.  Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980.  The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
32 
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38 
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
43 
44 ===============================================================================
45 */
46 
47 /* BSD licensing:
48  * Copyright (c) 2006, Fabrice Bellard
49  * All rights reserved.
50  *
51  * Redistribution and use in source and binary forms, with or without
52  * modification, are permitted provided that the following conditions are met:
53  *
54  * 1. Redistributions of source code must retain the above copyright notice,
55  * this list of conditions and the following disclaimer.
56  *
57  * 2. Redistributions in binary form must reproduce the above copyright notice,
58  * this list of conditions and the following disclaimer in the documentation
59  * and/or other materials provided with the distribution.
60  *
61  * 3. Neither the name of the copyright holder nor the names of its contributors
62  * may be used to endorse or promote products derived from this software without
63  * specific prior written permission.
64  *
65  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75  * THE POSSIBILITY OF SUCH DAMAGE.
76  */
77 
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79  * version 2 or later. See the COPYING file in the top-level directory.
80  */
81 
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83  * target-dependent and needs the TARGET_* macros.
84  */
85 #include "qemu/osdep.h"
86 #include "qemu/bitops.h"
87 #include "fpu/softfloat.h"
88 
89 /* We only need stdlib for abort() */
90 
91 /*----------------------------------------------------------------------------
92 | Primitive arithmetic functions, including multi-word arithmetic, and
93 | division and square root approximations.  (Can be specialized to target if
94 | desired.)
95 *----------------------------------------------------------------------------*/
96 #include "fpu/softfloat-macros.h"
97 
98 /*----------------------------------------------------------------------------
99 | Functions and definitions to determine:  (1) whether tininess for underflow
100 | is detected before or after rounding by default, (2) what (if anything)
101 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
102 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
103 | are propagated from function inputs to output.  These details are target-
104 | specific.
105 *----------------------------------------------------------------------------*/
106 #include "softfloat-specialize.h"
107 
108 /*----------------------------------------------------------------------------
109 | Returns the fraction bits of the half-precision floating-point value `a'.
110 *----------------------------------------------------------------------------*/
111 
112 static inline uint32_t extractFloat16Frac(float16 a)
113 {
114     return float16_val(a) & 0x3ff;
115 }
116 
117 /*----------------------------------------------------------------------------
118 | Returns the exponent bits of the half-precision floating-point value `a'.
119 *----------------------------------------------------------------------------*/
120 
121 static inline int extractFloat16Exp(float16 a)
122 {
123     return (float16_val(a) >> 10) & 0x1f;
124 }
125 
126 /*----------------------------------------------------------------------------
127 | Returns the sign bit of the single-precision floating-point value `a'.
128 *----------------------------------------------------------------------------*/
129 
130 static inline flag extractFloat16Sign(float16 a)
131 {
132     return float16_val(a)>>15;
133 }
134 
135 /*----------------------------------------------------------------------------
136 | Returns the fraction bits of the single-precision floating-point value `a'.
137 *----------------------------------------------------------------------------*/
138 
139 static inline uint32_t extractFloat32Frac(float32 a)
140 {
141     return float32_val(a) & 0x007FFFFF;
142 }
143 
144 /*----------------------------------------------------------------------------
145 | Returns the exponent bits of the single-precision floating-point value `a'.
146 *----------------------------------------------------------------------------*/
147 
148 static inline int extractFloat32Exp(float32 a)
149 {
150     return (float32_val(a) >> 23) & 0xFF;
151 }
152 
153 /*----------------------------------------------------------------------------
154 | Returns the sign bit of the single-precision floating-point value `a'.
155 *----------------------------------------------------------------------------*/
156 
157 static inline flag extractFloat32Sign(float32 a)
158 {
159     return float32_val(a) >> 31;
160 }
161 
162 /*----------------------------------------------------------------------------
163 | Returns the fraction bits of the double-precision floating-point value `a'.
164 *----------------------------------------------------------------------------*/
165 
166 static inline uint64_t extractFloat64Frac(float64 a)
167 {
168     return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF);
169 }
170 
171 /*----------------------------------------------------------------------------
172 | Returns the exponent bits of the double-precision floating-point value `a'.
173 *----------------------------------------------------------------------------*/
174 
175 static inline int extractFloat64Exp(float64 a)
176 {
177     return (float64_val(a) >> 52) & 0x7FF;
178 }
179 
180 /*----------------------------------------------------------------------------
181 | Returns the sign bit of the double-precision floating-point value `a'.
182 *----------------------------------------------------------------------------*/
183 
184 static inline flag extractFloat64Sign(float64 a)
185 {
186     return float64_val(a) >> 63;
187 }
188 
189 /*
190  * Classify a floating point number. Everything above float_class_qnan
191  * is a NaN so cls >= float_class_qnan is any NaN.
192  */
193 
194 typedef enum __attribute__ ((__packed__)) {
195     float_class_unclassified,
196     float_class_zero,
197     float_class_normal,
198     float_class_inf,
199     float_class_qnan,  /* all NaNs from here */
200     float_class_snan,
201     float_class_dnan,
202     float_class_msnan, /* maybe silenced */
203 } FloatClass;
204 
205 /*
206  * Structure holding all of the decomposed parts of a float. The
207  * exponent is unbiased and the fraction is normalized. All
208  * calculations are done with a 64 bit fraction and then rounded as
209  * appropriate for the final format.
210  *
211  * Thanks to the packed FloatClass a decent compiler should be able to
212  * fit the whole structure into registers and avoid using the stack
213  * for parameter passing.
214  */
215 
216 typedef struct {
217     uint64_t frac;
218     int32_t  exp;
219     FloatClass cls;
220     bool sign;
221 } FloatParts;
222 
223 #define DECOMPOSED_BINARY_POINT    (64 - 2)
224 #define DECOMPOSED_IMPLICIT_BIT    (1ull << DECOMPOSED_BINARY_POINT)
225 #define DECOMPOSED_OVERFLOW_BIT    (DECOMPOSED_IMPLICIT_BIT << 1)
226 
227 /* Structure holding all of the relevant parameters for a format.
228  *   exp_size: the size of the exponent field
229  *   exp_bias: the offset applied to the exponent field
230  *   exp_max: the maximum normalised exponent
231  *   frac_size: the size of the fraction field
232  *   frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
233  * The following are computed based the size of fraction
234  *   frac_lsb: least significant bit of fraction
235  *   fram_lsbm1: the bit bellow the least significant bit (for rounding)
236  *   round_mask/roundeven_mask: masks used for rounding
237  */
238 typedef struct {
239     int exp_size;
240     int exp_bias;
241     int exp_max;
242     int frac_size;
243     int frac_shift;
244     uint64_t frac_lsb;
245     uint64_t frac_lsbm1;
246     uint64_t round_mask;
247     uint64_t roundeven_mask;
248 } FloatFmt;
249 
250 /* Expand fields based on the size of exponent and fraction */
251 #define FLOAT_PARAMS(E, F)                                           \
252     .exp_size       = E,                                             \
253     .exp_bias       = ((1 << E) - 1) >> 1,                           \
254     .exp_max        = (1 << E) - 1,                                  \
255     .frac_size      = F,                                             \
256     .frac_shift     = DECOMPOSED_BINARY_POINT - F,                   \
257     .frac_lsb       = 1ull << (DECOMPOSED_BINARY_POINT - F),         \
258     .frac_lsbm1     = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1),   \
259     .round_mask     = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1,   \
260     .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1
261 
262 static const FloatFmt float16_params = {
263     FLOAT_PARAMS(5, 10)
264 };
265 
266 static const FloatFmt float32_params = {
267     FLOAT_PARAMS(8, 23)
268 };
269 
270 static const FloatFmt float64_params = {
271     FLOAT_PARAMS(11, 52)
272 };
273 
274 /* Unpack a float to parts, but do not canonicalize.  */
275 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw)
276 {
277     const int sign_pos = fmt.frac_size + fmt.exp_size;
278 
279     return (FloatParts) {
280         .cls = float_class_unclassified,
281         .sign = extract64(raw, sign_pos, 1),
282         .exp = extract64(raw, fmt.frac_size, fmt.exp_size),
283         .frac = extract64(raw, 0, fmt.frac_size),
284     };
285 }
286 
287 static inline FloatParts float16_unpack_raw(float16 f)
288 {
289     return unpack_raw(float16_params, f);
290 }
291 
292 static inline FloatParts float32_unpack_raw(float32 f)
293 {
294     return unpack_raw(float32_params, f);
295 }
296 
297 static inline FloatParts float64_unpack_raw(float64 f)
298 {
299     return unpack_raw(float64_params, f);
300 }
301 
302 /* Pack a float from parts, but do not canonicalize.  */
303 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p)
304 {
305     const int sign_pos = fmt.frac_size + fmt.exp_size;
306     uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp);
307     return deposit64(ret, sign_pos, 1, p.sign);
308 }
309 
310 static inline float16 float16_pack_raw(FloatParts p)
311 {
312     return make_float16(pack_raw(float16_params, p));
313 }
314 
315 static inline float32 float32_pack_raw(FloatParts p)
316 {
317     return make_float32(pack_raw(float32_params, p));
318 }
319 
320 static inline float64 float64_pack_raw(FloatParts p)
321 {
322     return make_float64(pack_raw(float64_params, p));
323 }
324 
325 /* Canonicalize EXP and FRAC, setting CLS.  */
326 static FloatParts canonicalize(FloatParts part, const FloatFmt *parm,
327                                float_status *status)
328 {
329     if (part.exp == parm->exp_max) {
330         if (part.frac == 0) {
331             part.cls = float_class_inf;
332         } else {
333 #ifdef NO_SIGNALING_NANS
334             part.cls = float_class_qnan;
335 #else
336             int64_t msb = part.frac << (parm->frac_shift + 2);
337             if ((msb < 0) == status->snan_bit_is_one) {
338                 part.cls = float_class_snan;
339             } else {
340                 part.cls = float_class_qnan;
341             }
342 #endif
343         }
344     } else if (part.exp == 0) {
345         if (likely(part.frac == 0)) {
346             part.cls = float_class_zero;
347         } else if (status->flush_inputs_to_zero) {
348             float_raise(float_flag_input_denormal, status);
349             part.cls = float_class_zero;
350             part.frac = 0;
351         } else {
352             int shift = clz64(part.frac) - 1;
353             part.cls = float_class_normal;
354             part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
355             part.frac <<= shift;
356         }
357     } else {
358         part.cls = float_class_normal;
359         part.exp -= parm->exp_bias;
360         part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
361     }
362     return part;
363 }
364 
365 /* Round and uncanonicalize a floating-point number by parts. There
366  * are FRAC_SHIFT bits that may require rounding at the bottom of the
367  * fraction; these bits will be removed. The exponent will be biased
368  * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
369  */
370 
371 static FloatParts round_canonical(FloatParts p, float_status *s,
372                                   const FloatFmt *parm)
373 {
374     const uint64_t frac_lsbm1 = parm->frac_lsbm1;
375     const uint64_t round_mask = parm->round_mask;
376     const uint64_t roundeven_mask = parm->roundeven_mask;
377     const int exp_max = parm->exp_max;
378     const int frac_shift = parm->frac_shift;
379     uint64_t frac, inc;
380     int exp, flags = 0;
381     bool overflow_norm;
382 
383     frac = p.frac;
384     exp = p.exp;
385 
386     switch (p.cls) {
387     case float_class_normal:
388         switch (s->float_rounding_mode) {
389         case float_round_nearest_even:
390             overflow_norm = false;
391             inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
392             break;
393         case float_round_ties_away:
394             overflow_norm = false;
395             inc = frac_lsbm1;
396             break;
397         case float_round_to_zero:
398             overflow_norm = true;
399             inc = 0;
400             break;
401         case float_round_up:
402             inc = p.sign ? 0 : round_mask;
403             overflow_norm = p.sign;
404             break;
405         case float_round_down:
406             inc = p.sign ? round_mask : 0;
407             overflow_norm = !p.sign;
408             break;
409         default:
410             g_assert_not_reached();
411         }
412 
413         exp += parm->exp_bias;
414         if (likely(exp > 0)) {
415             if (frac & round_mask) {
416                 flags |= float_flag_inexact;
417                 frac += inc;
418                 if (frac & DECOMPOSED_OVERFLOW_BIT) {
419                     frac >>= 1;
420                     exp++;
421                 }
422             }
423             frac >>= frac_shift;
424 
425             if (unlikely(exp >= exp_max)) {
426                 flags |= float_flag_overflow | float_flag_inexact;
427                 if (overflow_norm) {
428                     exp = exp_max - 1;
429                     frac = -1;
430                 } else {
431                     p.cls = float_class_inf;
432                     goto do_inf;
433                 }
434             }
435         } else if (s->flush_to_zero) {
436             flags |= float_flag_output_denormal;
437             p.cls = float_class_zero;
438             goto do_zero;
439         } else {
440             bool is_tiny = (s->float_detect_tininess
441                             == float_tininess_before_rounding)
442                         || (exp < 0)
443                         || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT);
444 
445             shift64RightJamming(frac, 1 - exp, &frac);
446             if (frac & round_mask) {
447                 /* Need to recompute round-to-even.  */
448                 if (s->float_rounding_mode == float_round_nearest_even) {
449                     inc = ((frac & roundeven_mask) != frac_lsbm1
450                            ? frac_lsbm1 : 0);
451                 }
452                 flags |= float_flag_inexact;
453                 frac += inc;
454             }
455 
456             exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
457             frac >>= frac_shift;
458 
459             if (is_tiny && (flags & float_flag_inexact)) {
460                 flags |= float_flag_underflow;
461             }
462             if (exp == 0 && frac == 0) {
463                 p.cls = float_class_zero;
464             }
465         }
466         break;
467 
468     case float_class_zero:
469     do_zero:
470         exp = 0;
471         frac = 0;
472         break;
473 
474     case float_class_inf:
475     do_inf:
476         exp = exp_max;
477         frac = 0;
478         break;
479 
480     case float_class_qnan:
481     case float_class_snan:
482         exp = exp_max;
483         break;
484 
485     default:
486         g_assert_not_reached();
487     }
488 
489     float_raise(flags, s);
490     p.exp = exp;
491     p.frac = frac;
492     return p;
493 }
494 
495 static FloatParts float16_unpack_canonical(float16 f, float_status *s)
496 {
497     return canonicalize(float16_unpack_raw(f), &float16_params, s);
498 }
499 
500 static float16 float16_round_pack_canonical(FloatParts p, float_status *s)
501 {
502     switch (p.cls) {
503     case float_class_dnan:
504         return float16_default_nan(s);
505     case float_class_msnan:
506         return float16_maybe_silence_nan(float16_pack_raw(p), s);
507     default:
508         p = round_canonical(p, s, &float16_params);
509         return float16_pack_raw(p);
510     }
511 }
512 
513 static FloatParts float32_unpack_canonical(float32 f, float_status *s)
514 {
515     return canonicalize(float32_unpack_raw(f), &float32_params, s);
516 }
517 
518 static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
519 {
520     switch (p.cls) {
521     case float_class_dnan:
522         return float32_default_nan(s);
523     case float_class_msnan:
524         return float32_maybe_silence_nan(float32_pack_raw(p), s);
525     default:
526         p = round_canonical(p, s, &float32_params);
527         return float32_pack_raw(p);
528     }
529 }
530 
531 static FloatParts float64_unpack_canonical(float64 f, float_status *s)
532 {
533     return canonicalize(float64_unpack_raw(f), &float64_params, s);
534 }
535 
536 static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
537 {
538     switch (p.cls) {
539     case float_class_dnan:
540         return float64_default_nan(s);
541     case float_class_msnan:
542         return float64_maybe_silence_nan(float64_pack_raw(p), s);
543     default:
544         p = round_canonical(p, s, &float64_params);
545         return float64_pack_raw(p);
546     }
547 }
548 
549 /* Simple helpers for checking if what NaN we have */
550 static bool is_nan(FloatClass c)
551 {
552     return unlikely(c >= float_class_qnan);
553 }
554 static bool is_snan(FloatClass c)
555 {
556     return c == float_class_snan;
557 }
558 static bool is_qnan(FloatClass c)
559 {
560     return c == float_class_qnan;
561 }
562 
563 static FloatParts return_nan(FloatParts a, float_status *s)
564 {
565     switch (a.cls) {
566     case float_class_snan:
567         s->float_exception_flags |= float_flag_invalid;
568         a.cls = float_class_msnan;
569         /* fall through */
570     case float_class_qnan:
571         if (s->default_nan_mode) {
572             a.cls = float_class_dnan;
573         }
574         break;
575 
576     default:
577         g_assert_not_reached();
578     }
579     return a;
580 }
581 
582 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s)
583 {
584     if (is_snan(a.cls) || is_snan(b.cls)) {
585         s->float_exception_flags |= float_flag_invalid;
586     }
587 
588     if (s->default_nan_mode) {
589         a.cls = float_class_dnan;
590     } else {
591         if (pickNaN(is_qnan(a.cls), is_snan(a.cls),
592                     is_qnan(b.cls), is_snan(b.cls),
593                     a.frac > b.frac ||
594                     (a.frac == b.frac && a.sign < b.sign))) {
595             a = b;
596         }
597         a.cls = float_class_msnan;
598     }
599     return a;
600 }
601 
602 static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c,
603                                   bool inf_zero, float_status *s)
604 {
605     int which;
606 
607     if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
608         s->float_exception_flags |= float_flag_invalid;
609     }
610 
611     which = pickNaNMulAdd(is_qnan(a.cls), is_snan(a.cls),
612                           is_qnan(b.cls), is_snan(b.cls),
613                           is_qnan(c.cls), is_snan(c.cls),
614                           inf_zero, s);
615 
616     if (s->default_nan_mode) {
617         /* Note that this check is after pickNaNMulAdd so that function
618          * has an opportunity to set the Invalid flag.
619          */
620         a.cls = float_class_dnan;
621         return a;
622     }
623 
624     switch (which) {
625     case 0:
626         break;
627     case 1:
628         a = b;
629         break;
630     case 2:
631         a = c;
632         break;
633     case 3:
634         a.cls = float_class_dnan;
635         return a;
636     default:
637         g_assert_not_reached();
638     }
639     a.cls = float_class_msnan;
640 
641     return a;
642 }
643 
644 /*
645  * Returns the result of adding or subtracting the values of the
646  * floating-point values `a' and `b'. The operation is performed
647  * according to the IEC/IEEE Standard for Binary Floating-Point
648  * Arithmetic.
649  */
650 
651 static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
652                                 float_status *s)
653 {
654     bool a_sign = a.sign;
655     bool b_sign = b.sign ^ subtract;
656 
657     if (a_sign != b_sign) {
658         /* Subtraction */
659 
660         if (a.cls == float_class_normal && b.cls == float_class_normal) {
661             if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
662                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
663                 a.frac = a.frac - b.frac;
664             } else {
665                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
666                 a.frac = b.frac - a.frac;
667                 a.exp = b.exp;
668                 a_sign ^= 1;
669             }
670 
671             if (a.frac == 0) {
672                 a.cls = float_class_zero;
673                 a.sign = s->float_rounding_mode == float_round_down;
674             } else {
675                 int shift = clz64(a.frac) - 1;
676                 a.frac = a.frac << shift;
677                 a.exp = a.exp - shift;
678                 a.sign = a_sign;
679             }
680             return a;
681         }
682         if (is_nan(a.cls) || is_nan(b.cls)) {
683             return pick_nan(a, b, s);
684         }
685         if (a.cls == float_class_inf) {
686             if (b.cls == float_class_inf) {
687                 float_raise(float_flag_invalid, s);
688                 a.cls = float_class_dnan;
689             }
690             return a;
691         }
692         if (a.cls == float_class_zero && b.cls == float_class_zero) {
693             a.sign = s->float_rounding_mode == float_round_down;
694             return a;
695         }
696         if (a.cls == float_class_zero || b.cls == float_class_inf) {
697             b.sign = a_sign ^ 1;
698             return b;
699         }
700         if (b.cls == float_class_zero) {
701             return a;
702         }
703     } else {
704         /* Addition */
705         if (a.cls == float_class_normal && b.cls == float_class_normal) {
706             if (a.exp > b.exp) {
707                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
708             } else if (a.exp < b.exp) {
709                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
710                 a.exp = b.exp;
711             }
712             a.frac += b.frac;
713             if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
714                 a.frac >>= 1;
715                 a.exp += 1;
716             }
717             return a;
718         }
719         if (is_nan(a.cls) || is_nan(b.cls)) {
720             return pick_nan(a, b, s);
721         }
722         if (a.cls == float_class_inf || b.cls == float_class_zero) {
723             return a;
724         }
725         if (b.cls == float_class_inf || a.cls == float_class_zero) {
726             b.sign = b_sign;
727             return b;
728         }
729     }
730     g_assert_not_reached();
731 }
732 
733 /*
734  * Returns the result of adding or subtracting the floating-point
735  * values `a' and `b'. The operation is performed according to the
736  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
737  */
738 
739 float16  __attribute__((flatten)) float16_add(float16 a, float16 b,
740                                               float_status *status)
741 {
742     FloatParts pa = float16_unpack_canonical(a, status);
743     FloatParts pb = float16_unpack_canonical(b, status);
744     FloatParts pr = addsub_floats(pa, pb, false, status);
745 
746     return float16_round_pack_canonical(pr, status);
747 }
748 
749 float32 __attribute__((flatten)) float32_add(float32 a, float32 b,
750                                              float_status *status)
751 {
752     FloatParts pa = float32_unpack_canonical(a, status);
753     FloatParts pb = float32_unpack_canonical(b, status);
754     FloatParts pr = addsub_floats(pa, pb, false, status);
755 
756     return float32_round_pack_canonical(pr, status);
757 }
758 
759 float64 __attribute__((flatten)) float64_add(float64 a, float64 b,
760                                              float_status *status)
761 {
762     FloatParts pa = float64_unpack_canonical(a, status);
763     FloatParts pb = float64_unpack_canonical(b, status);
764     FloatParts pr = addsub_floats(pa, pb, false, status);
765 
766     return float64_round_pack_canonical(pr, status);
767 }
768 
769 float16 __attribute__((flatten)) float16_sub(float16 a, float16 b,
770                                              float_status *status)
771 {
772     FloatParts pa = float16_unpack_canonical(a, status);
773     FloatParts pb = float16_unpack_canonical(b, status);
774     FloatParts pr = addsub_floats(pa, pb, true, status);
775 
776     return float16_round_pack_canonical(pr, status);
777 }
778 
779 float32 __attribute__((flatten)) float32_sub(float32 a, float32 b,
780                                              float_status *status)
781 {
782     FloatParts pa = float32_unpack_canonical(a, status);
783     FloatParts pb = float32_unpack_canonical(b, status);
784     FloatParts pr = addsub_floats(pa, pb, true, status);
785 
786     return float32_round_pack_canonical(pr, status);
787 }
788 
789 float64 __attribute__((flatten)) float64_sub(float64 a, float64 b,
790                                              float_status *status)
791 {
792     FloatParts pa = float64_unpack_canonical(a, status);
793     FloatParts pb = float64_unpack_canonical(b, status);
794     FloatParts pr = addsub_floats(pa, pb, true, status);
795 
796     return float64_round_pack_canonical(pr, status);
797 }
798 
799 /*
800  * Returns the result of multiplying the floating-point values `a' and
801  * `b'. The operation is performed according to the IEC/IEEE Standard
802  * for Binary Floating-Point Arithmetic.
803  */
804 
805 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s)
806 {
807     bool sign = a.sign ^ b.sign;
808 
809     if (a.cls == float_class_normal && b.cls == float_class_normal) {
810         uint64_t hi, lo;
811         int exp = a.exp + b.exp;
812 
813         mul64To128(a.frac, b.frac, &hi, &lo);
814         shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
815         if (lo & DECOMPOSED_OVERFLOW_BIT) {
816             shift64RightJamming(lo, 1, &lo);
817             exp += 1;
818         }
819 
820         /* Re-use a */
821         a.exp = exp;
822         a.sign = sign;
823         a.frac = lo;
824         return a;
825     }
826     /* handle all the NaN cases */
827     if (is_nan(a.cls) || is_nan(b.cls)) {
828         return pick_nan(a, b, s);
829     }
830     /* Inf * Zero == NaN */
831     if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
832         (a.cls == float_class_zero && b.cls == float_class_inf)) {
833         s->float_exception_flags |= float_flag_invalid;
834         a.cls = float_class_dnan;
835         a.sign = sign;
836         return a;
837     }
838     /* Multiply by 0 or Inf */
839     if (a.cls == float_class_inf || a.cls == float_class_zero) {
840         a.sign = sign;
841         return a;
842     }
843     if (b.cls == float_class_inf || b.cls == float_class_zero) {
844         b.sign = sign;
845         return b;
846     }
847     g_assert_not_reached();
848 }
849 
850 float16 __attribute__((flatten)) float16_mul(float16 a, float16 b,
851                                              float_status *status)
852 {
853     FloatParts pa = float16_unpack_canonical(a, status);
854     FloatParts pb = float16_unpack_canonical(b, status);
855     FloatParts pr = mul_floats(pa, pb, status);
856 
857     return float16_round_pack_canonical(pr, status);
858 }
859 
860 float32 __attribute__((flatten)) float32_mul(float32 a, float32 b,
861                                              float_status *status)
862 {
863     FloatParts pa = float32_unpack_canonical(a, status);
864     FloatParts pb = float32_unpack_canonical(b, status);
865     FloatParts pr = mul_floats(pa, pb, status);
866 
867     return float32_round_pack_canonical(pr, status);
868 }
869 
870 float64 __attribute__((flatten)) float64_mul(float64 a, float64 b,
871                                              float_status *status)
872 {
873     FloatParts pa = float64_unpack_canonical(a, status);
874     FloatParts pb = float64_unpack_canonical(b, status);
875     FloatParts pr = mul_floats(pa, pb, status);
876 
877     return float64_round_pack_canonical(pr, status);
878 }
879 
880 /*
881  * Returns the result of multiplying the floating-point values `a' and
882  * `b' then adding 'c', with no intermediate rounding step after the
883  * multiplication. The operation is performed according to the
884  * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
885  * The flags argument allows the caller to select negation of the
886  * addend, the intermediate product, or the final result. (The
887  * difference between this and having the caller do a separate
888  * negation is that negating externally will flip the sign bit on
889  * NaNs.)
890  */
891 
892 static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c,
893                                 int flags, float_status *s)
894 {
895     bool inf_zero = ((1 << a.cls) | (1 << b.cls)) ==
896                     ((1 << float_class_inf) | (1 << float_class_zero));
897     bool p_sign;
898     bool sign_flip = flags & float_muladd_negate_result;
899     FloatClass p_class;
900     uint64_t hi, lo;
901     int p_exp;
902 
903     /* It is implementation-defined whether the cases of (0,inf,qnan)
904      * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
905      * they return if they do), so we have to hand this information
906      * off to the target-specific pick-a-NaN routine.
907      */
908     if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) {
909         return pick_nan_muladd(a, b, c, inf_zero, s);
910     }
911 
912     if (inf_zero) {
913         s->float_exception_flags |= float_flag_invalid;
914         a.cls = float_class_dnan;
915         return a;
916     }
917 
918     if (flags & float_muladd_negate_c) {
919         c.sign ^= 1;
920     }
921 
922     p_sign = a.sign ^ b.sign;
923 
924     if (flags & float_muladd_negate_product) {
925         p_sign ^= 1;
926     }
927 
928     if (a.cls == float_class_inf || b.cls == float_class_inf) {
929         p_class = float_class_inf;
930     } else if (a.cls == float_class_zero || b.cls == float_class_zero) {
931         p_class = float_class_zero;
932     } else {
933         p_class = float_class_normal;
934     }
935 
936     if (c.cls == float_class_inf) {
937         if (p_class == float_class_inf && p_sign != c.sign) {
938             s->float_exception_flags |= float_flag_invalid;
939             a.cls = float_class_dnan;
940         } else {
941             a.cls = float_class_inf;
942             a.sign = c.sign ^ sign_flip;
943         }
944         return a;
945     }
946 
947     if (p_class == float_class_inf) {
948         a.cls = float_class_inf;
949         a.sign = p_sign ^ sign_flip;
950         return a;
951     }
952 
953     if (p_class == float_class_zero) {
954         if (c.cls == float_class_zero) {
955             if (p_sign != c.sign) {
956                 p_sign = s->float_rounding_mode == float_round_down;
957             }
958             c.sign = p_sign;
959         } else if (flags & float_muladd_halve_result) {
960             c.exp -= 1;
961         }
962         c.sign ^= sign_flip;
963         return c;
964     }
965 
966     /* a & b should be normals now... */
967     assert(a.cls == float_class_normal &&
968            b.cls == float_class_normal);
969 
970     p_exp = a.exp + b.exp;
971 
972     /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
973      * result.
974      */
975     mul64To128(a.frac, b.frac, &hi, &lo);
976     /* binary point now at bit 124 */
977 
978     /* check for overflow */
979     if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) {
980         shift128RightJamming(hi, lo, 1, &hi, &lo);
981         p_exp += 1;
982     }
983 
984     /* + add/sub */
985     if (c.cls == float_class_zero) {
986         /* move binary point back to 62 */
987         shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
988     } else {
989         int exp_diff = p_exp - c.exp;
990         if (p_sign == c.sign) {
991             /* Addition */
992             if (exp_diff <= 0) {
993                 shift128RightJamming(hi, lo,
994                                      DECOMPOSED_BINARY_POINT - exp_diff,
995                                      &hi, &lo);
996                 lo += c.frac;
997                 p_exp = c.exp;
998             } else {
999                 uint64_t c_hi, c_lo;
1000                 /* shift c to the same binary point as the product (124) */
1001                 c_hi = c.frac >> 2;
1002                 c_lo = 0;
1003                 shift128RightJamming(c_hi, c_lo,
1004                                      exp_diff,
1005                                      &c_hi, &c_lo);
1006                 add128(hi, lo, c_hi, c_lo, &hi, &lo);
1007                 /* move binary point back to 62 */
1008                 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1009             }
1010 
1011             if (lo & DECOMPOSED_OVERFLOW_BIT) {
1012                 shift64RightJamming(lo, 1, &lo);
1013                 p_exp += 1;
1014             }
1015 
1016         } else {
1017             /* Subtraction */
1018             uint64_t c_hi, c_lo;
1019             /* make C binary point match product at bit 124 */
1020             c_hi = c.frac >> 2;
1021             c_lo = 0;
1022 
1023             if (exp_diff <= 0) {
1024                 shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
1025                 if (exp_diff == 0
1026                     &&
1027                     (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
1028                     sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1029                 } else {
1030                     sub128(c_hi, c_lo, hi, lo, &hi, &lo);
1031                     p_sign ^= 1;
1032                     p_exp = c.exp;
1033                 }
1034             } else {
1035                 shift128RightJamming(c_hi, c_lo,
1036                                      exp_diff,
1037                                      &c_hi, &c_lo);
1038                 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1039             }
1040 
1041             if (hi == 0 && lo == 0) {
1042                 a.cls = float_class_zero;
1043                 a.sign = s->float_rounding_mode == float_round_down;
1044                 a.sign ^= sign_flip;
1045                 return a;
1046             } else {
1047                 int shift;
1048                 if (hi != 0) {
1049                     shift = clz64(hi);
1050                 } else {
1051                     shift = clz64(lo) + 64;
1052                 }
1053                 /* Normalizing to a binary point of 124 is the
1054                    correct adjust for the exponent.  However since we're
1055                    shifting, we might as well put the binary point back
1056                    at 62 where we really want it.  Therefore shift as
1057                    if we're leaving 1 bit at the top of the word, but
1058                    adjust the exponent as if we're leaving 3 bits.  */
1059                 shift -= 1;
1060                 if (shift >= 64) {
1061                     lo = lo << (shift - 64);
1062                 } else {
1063                     hi = (hi << shift) | (lo >> (64 - shift));
1064                     lo = hi | ((lo << shift) != 0);
1065                 }
1066                 p_exp -= shift - 2;
1067             }
1068         }
1069     }
1070 
1071     if (flags & float_muladd_halve_result) {
1072         p_exp -= 1;
1073     }
1074 
1075     /* finally prepare our result */
1076     a.cls = float_class_normal;
1077     a.sign = p_sign ^ sign_flip;
1078     a.exp = p_exp;
1079     a.frac = lo;
1080 
1081     return a;
1082 }
1083 
1084 float16 __attribute__((flatten)) float16_muladd(float16 a, float16 b, float16 c,
1085                                                 int flags, float_status *status)
1086 {
1087     FloatParts pa = float16_unpack_canonical(a, status);
1088     FloatParts pb = float16_unpack_canonical(b, status);
1089     FloatParts pc = float16_unpack_canonical(c, status);
1090     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1091 
1092     return float16_round_pack_canonical(pr, status);
1093 }
1094 
1095 float32 __attribute__((flatten)) float32_muladd(float32 a, float32 b, float32 c,
1096                                                 int flags, float_status *status)
1097 {
1098     FloatParts pa = float32_unpack_canonical(a, status);
1099     FloatParts pb = float32_unpack_canonical(b, status);
1100     FloatParts pc = float32_unpack_canonical(c, status);
1101     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1102 
1103     return float32_round_pack_canonical(pr, status);
1104 }
1105 
1106 float64 __attribute__((flatten)) float64_muladd(float64 a, float64 b, float64 c,
1107                                                 int flags, float_status *status)
1108 {
1109     FloatParts pa = float64_unpack_canonical(a, status);
1110     FloatParts pb = float64_unpack_canonical(b, status);
1111     FloatParts pc = float64_unpack_canonical(c, status);
1112     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1113 
1114     return float64_round_pack_canonical(pr, status);
1115 }
1116 
1117 /*
1118  * Returns the result of dividing the floating-point value `a' by the
1119  * corresponding value `b'. The operation is performed according to
1120  * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1121  */
1122 
1123 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s)
1124 {
1125     bool sign = a.sign ^ b.sign;
1126 
1127     if (a.cls == float_class_normal && b.cls == float_class_normal) {
1128         uint64_t temp_lo, temp_hi;
1129         int exp = a.exp - b.exp;
1130         if (a.frac < b.frac) {
1131             exp -= 1;
1132             shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1,
1133                               &temp_hi, &temp_lo);
1134         } else {
1135             shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT,
1136                               &temp_hi, &temp_lo);
1137         }
1138         /* LSB of quot is set if inexact which roundandpack will use
1139          * to set flags. Yet again we re-use a for the result */
1140         a.frac = div128To64(temp_lo, temp_hi, b.frac);
1141         a.sign = sign;
1142         a.exp = exp;
1143         return a;
1144     }
1145     /* handle all the NaN cases */
1146     if (is_nan(a.cls) || is_nan(b.cls)) {
1147         return pick_nan(a, b, s);
1148     }
1149     /* 0/0 or Inf/Inf */
1150     if (a.cls == b.cls
1151         &&
1152         (a.cls == float_class_inf || a.cls == float_class_zero)) {
1153         s->float_exception_flags |= float_flag_invalid;
1154         a.cls = float_class_dnan;
1155         return a;
1156     }
1157     /* Inf / x or 0 / x */
1158     if (a.cls == float_class_inf || a.cls == float_class_zero) {
1159         a.sign = sign;
1160         return a;
1161     }
1162     /* Div 0 => Inf */
1163     if (b.cls == float_class_zero) {
1164         s->float_exception_flags |= float_flag_divbyzero;
1165         a.cls = float_class_inf;
1166         a.sign = sign;
1167         return a;
1168     }
1169     /* Div by Inf */
1170     if (b.cls == float_class_inf) {
1171         a.cls = float_class_zero;
1172         a.sign = sign;
1173         return a;
1174     }
1175     g_assert_not_reached();
1176 }
1177 
1178 float16 float16_div(float16 a, float16 b, float_status *status)
1179 {
1180     FloatParts pa = float16_unpack_canonical(a, status);
1181     FloatParts pb = float16_unpack_canonical(b, status);
1182     FloatParts pr = div_floats(pa, pb, status);
1183 
1184     return float16_round_pack_canonical(pr, status);
1185 }
1186 
1187 float32 float32_div(float32 a, float32 b, float_status *status)
1188 {
1189     FloatParts pa = float32_unpack_canonical(a, status);
1190     FloatParts pb = float32_unpack_canonical(b, status);
1191     FloatParts pr = div_floats(pa, pb, status);
1192 
1193     return float32_round_pack_canonical(pr, status);
1194 }
1195 
1196 float64 float64_div(float64 a, float64 b, float_status *status)
1197 {
1198     FloatParts pa = float64_unpack_canonical(a, status);
1199     FloatParts pb = float64_unpack_canonical(b, status);
1200     FloatParts pr = div_floats(pa, pb, status);
1201 
1202     return float64_round_pack_canonical(pr, status);
1203 }
1204 
1205 /*
1206  * Rounds the floating-point value `a' to an integer, and returns the
1207  * result as a floating-point value. The operation is performed
1208  * according to the IEC/IEEE Standard for Binary Floating-Point
1209  * Arithmetic.
1210  */
1211 
1212 static FloatParts round_to_int(FloatParts a, int rounding_mode, float_status *s)
1213 {
1214     if (is_nan(a.cls)) {
1215         return return_nan(a, s);
1216     }
1217 
1218     switch (a.cls) {
1219     case float_class_zero:
1220     case float_class_inf:
1221     case float_class_qnan:
1222         /* already "integral" */
1223         break;
1224     case float_class_normal:
1225         if (a.exp >= DECOMPOSED_BINARY_POINT) {
1226             /* already integral */
1227             break;
1228         }
1229         if (a.exp < 0) {
1230             bool one;
1231             /* all fractional */
1232             s->float_exception_flags |= float_flag_inexact;
1233             switch (rounding_mode) {
1234             case float_round_nearest_even:
1235                 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
1236                 break;
1237             case float_round_ties_away:
1238                 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
1239                 break;
1240             case float_round_to_zero:
1241                 one = false;
1242                 break;
1243             case float_round_up:
1244                 one = !a.sign;
1245                 break;
1246             case float_round_down:
1247                 one = a.sign;
1248                 break;
1249             default:
1250                 g_assert_not_reached();
1251             }
1252 
1253             if (one) {
1254                 a.frac = DECOMPOSED_IMPLICIT_BIT;
1255                 a.exp = 0;
1256             } else {
1257                 a.cls = float_class_zero;
1258             }
1259         } else {
1260             uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
1261             uint64_t frac_lsbm1 = frac_lsb >> 1;
1262             uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
1263             uint64_t rnd_mask = rnd_even_mask >> 1;
1264             uint64_t inc;
1265 
1266             switch (rounding_mode) {
1267             case float_round_nearest_even:
1268                 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
1269                 break;
1270             case float_round_ties_away:
1271                 inc = frac_lsbm1;
1272                 break;
1273             case float_round_to_zero:
1274                 inc = 0;
1275                 break;
1276             case float_round_up:
1277                 inc = a.sign ? 0 : rnd_mask;
1278                 break;
1279             case float_round_down:
1280                 inc = a.sign ? rnd_mask : 0;
1281                 break;
1282             default:
1283                 g_assert_not_reached();
1284             }
1285 
1286             if (a.frac & rnd_mask) {
1287                 s->float_exception_flags |= float_flag_inexact;
1288                 a.frac += inc;
1289                 a.frac &= ~rnd_mask;
1290                 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
1291                     a.frac >>= 1;
1292                     a.exp++;
1293                 }
1294             }
1295         }
1296         break;
1297     default:
1298         g_assert_not_reached();
1299     }
1300     return a;
1301 }
1302 
1303 float16 float16_round_to_int(float16 a, float_status *s)
1304 {
1305     FloatParts pa = float16_unpack_canonical(a, s);
1306     FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1307     return float16_round_pack_canonical(pr, s);
1308 }
1309 
1310 float32 float32_round_to_int(float32 a, float_status *s)
1311 {
1312     FloatParts pa = float32_unpack_canonical(a, s);
1313     FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1314     return float32_round_pack_canonical(pr, s);
1315 }
1316 
1317 float64 float64_round_to_int(float64 a, float_status *s)
1318 {
1319     FloatParts pa = float64_unpack_canonical(a, s);
1320     FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1321     return float64_round_pack_canonical(pr, s);
1322 }
1323 
1324 float64 float64_trunc_to_int(float64 a, float_status *s)
1325 {
1326     FloatParts pa = float64_unpack_canonical(a, s);
1327     FloatParts pr = round_to_int(pa, float_round_to_zero, s);
1328     return float64_round_pack_canonical(pr, s);
1329 }
1330 
1331 /*
1332  * Returns the result of converting the floating-point value `a' to
1333  * the two's complement integer format. The conversion is performed
1334  * according to the IEC/IEEE Standard for Binary Floating-Point
1335  * Arithmetic---which means in particular that the conversion is
1336  * rounded according to the current rounding mode. If `a' is a NaN,
1337  * the largest positive integer is returned. Otherwise, if the
1338  * conversion overflows, the largest integer with the same sign as `a'
1339  * is returned.
1340 */
1341 
1342 static int64_t round_to_int_and_pack(FloatParts in, int rmode,
1343                                      int64_t min, int64_t max,
1344                                      float_status *s)
1345 {
1346     uint64_t r;
1347     int orig_flags = get_float_exception_flags(s);
1348     FloatParts p = round_to_int(in, rmode, s);
1349 
1350     switch (p.cls) {
1351     case float_class_snan:
1352     case float_class_qnan:
1353     case float_class_dnan:
1354     case float_class_msnan:
1355         s->float_exception_flags = orig_flags | float_flag_invalid;
1356         return max;
1357     case float_class_inf:
1358         s->float_exception_flags = orig_flags | float_flag_invalid;
1359         return p.sign ? min : max;
1360     case float_class_zero:
1361         return 0;
1362     case float_class_normal:
1363         if (p.exp < DECOMPOSED_BINARY_POINT) {
1364             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1365         } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1366             r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1367         } else {
1368             r = UINT64_MAX;
1369         }
1370         if (p.sign) {
1371             if (r < -(uint64_t) min) {
1372                 return -r;
1373             } else {
1374                 s->float_exception_flags = orig_flags | float_flag_invalid;
1375                 return min;
1376             }
1377         } else {
1378             if (r < max) {
1379                 return r;
1380             } else {
1381                 s->float_exception_flags = orig_flags | float_flag_invalid;
1382                 return max;
1383             }
1384         }
1385     default:
1386         g_assert_not_reached();
1387     }
1388 }
1389 
1390 #define FLOAT_TO_INT(fsz, isz)                                          \
1391 int ## isz ## _t float ## fsz ## _to_int ## isz(float ## fsz a,         \
1392                                                 float_status *s)        \
1393 {                                                                       \
1394     FloatParts p = float ## fsz ## _unpack_canonical(a, s);             \
1395     return round_to_int_and_pack(p, s->float_rounding_mode,             \
1396                                  INT ## isz ## _MIN, INT ## isz ## _MAX,\
1397                                  s);                                    \
1398 }                                                                       \
1399                                                                         \
1400 int ## isz ## _t float ## fsz ## _to_int ## isz ## _round_to_zero       \
1401  (float ## fsz a, float_status *s)                                      \
1402 {                                                                       \
1403     FloatParts p = float ## fsz ## _unpack_canonical(a, s);             \
1404     return round_to_int_and_pack(p, float_round_to_zero,                \
1405                                  INT ## isz ## _MIN, INT ## isz ## _MAX,\
1406                                  s);                                    \
1407 }
1408 
1409 FLOAT_TO_INT(16, 16)
1410 FLOAT_TO_INT(16, 32)
1411 FLOAT_TO_INT(16, 64)
1412 
1413 FLOAT_TO_INT(32, 16)
1414 FLOAT_TO_INT(32, 32)
1415 FLOAT_TO_INT(32, 64)
1416 
1417 FLOAT_TO_INT(64, 16)
1418 FLOAT_TO_INT(64, 32)
1419 FLOAT_TO_INT(64, 64)
1420 
1421 #undef FLOAT_TO_INT
1422 
1423 /*
1424  *  Returns the result of converting the floating-point value `a' to
1425  *  the unsigned integer format. The conversion is performed according
1426  *  to the IEC/IEEE Standard for Binary Floating-Point
1427  *  Arithmetic---which means in particular that the conversion is
1428  *  rounded according to the current rounding mode. If `a' is a NaN,
1429  *  the largest unsigned integer is returned. Otherwise, if the
1430  *  conversion overflows, the largest unsigned integer is returned. If
1431  *  the 'a' is negative, the result is rounded and zero is returned;
1432  *  values that do not round to zero will raise the inexact exception
1433  *  flag.
1434  */
1435 
1436 static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, uint64_t max,
1437                                        float_status *s)
1438 {
1439     int orig_flags = get_float_exception_flags(s);
1440     FloatParts p = round_to_int(in, rmode, s);
1441 
1442     switch (p.cls) {
1443     case float_class_snan:
1444     case float_class_qnan:
1445     case float_class_dnan:
1446     case float_class_msnan:
1447         s->float_exception_flags = orig_flags | float_flag_invalid;
1448         return max;
1449     case float_class_inf:
1450         s->float_exception_flags = orig_flags | float_flag_invalid;
1451         return p.sign ? 0 : max;
1452     case float_class_zero:
1453         return 0;
1454     case float_class_normal:
1455     {
1456         uint64_t r;
1457         if (p.sign) {
1458             s->float_exception_flags = orig_flags | float_flag_invalid;
1459             return 0;
1460         }
1461 
1462         if (p.exp < DECOMPOSED_BINARY_POINT) {
1463             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1464         } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1465             r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1466         } else {
1467             s->float_exception_flags = orig_flags | float_flag_invalid;
1468             return max;
1469         }
1470 
1471         /* For uint64 this will never trip, but if p.exp is too large
1472          * to shift a decomposed fraction we shall have exited via the
1473          * 3rd leg above.
1474          */
1475         if (r > max) {
1476             s->float_exception_flags = orig_flags | float_flag_invalid;
1477             return max;
1478         } else {
1479             return r;
1480         }
1481     }
1482     default:
1483         g_assert_not_reached();
1484     }
1485 }
1486 
1487 #define FLOAT_TO_UINT(fsz, isz) \
1488 uint ## isz ## _t float ## fsz ## _to_uint ## isz(float ## fsz a,       \
1489                                                   float_status *s)      \
1490 {                                                                       \
1491     FloatParts p = float ## fsz ## _unpack_canonical(a, s);             \
1492     return round_to_uint_and_pack(p, s->float_rounding_mode,            \
1493                                  UINT ## isz ## _MAX, s);               \
1494 }                                                                       \
1495                                                                         \
1496 uint ## isz ## _t float ## fsz ## _to_uint ## isz ## _round_to_zero     \
1497  (float ## fsz a, float_status *s)                                      \
1498 {                                                                       \
1499     FloatParts p = float ## fsz ## _unpack_canonical(a, s);             \
1500     return round_to_uint_and_pack(p, float_round_to_zero,               \
1501                                   UINT ## isz ## _MAX, s);              \
1502 }
1503 
1504 FLOAT_TO_UINT(16, 16)
1505 FLOAT_TO_UINT(16, 32)
1506 FLOAT_TO_UINT(16, 64)
1507 
1508 FLOAT_TO_UINT(32, 16)
1509 FLOAT_TO_UINT(32, 32)
1510 FLOAT_TO_UINT(32, 64)
1511 
1512 FLOAT_TO_UINT(64, 16)
1513 FLOAT_TO_UINT(64, 32)
1514 FLOAT_TO_UINT(64, 64)
1515 
1516 #undef FLOAT_TO_UINT
1517 
1518 /*
1519  * Integer to float conversions
1520  *
1521  * Returns the result of converting the two's complement integer `a'
1522  * to the floating-point format. The conversion is performed according
1523  * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1524  */
1525 
1526 static FloatParts int_to_float(int64_t a, float_status *status)
1527 {
1528     FloatParts r;
1529     if (a == 0) {
1530         r.cls = float_class_zero;
1531         r.sign = false;
1532     } else if (a == (1ULL << 63)) {
1533         r.cls = float_class_normal;
1534         r.sign = true;
1535         r.frac = DECOMPOSED_IMPLICIT_BIT;
1536         r.exp = 63;
1537     } else {
1538         uint64_t f;
1539         if (a < 0) {
1540             f = -a;
1541             r.sign = true;
1542         } else {
1543             f = a;
1544             r.sign = false;
1545         }
1546         int shift = clz64(f) - 1;
1547         r.cls = float_class_normal;
1548         r.exp = (DECOMPOSED_BINARY_POINT - shift);
1549         r.frac = f << shift;
1550     }
1551 
1552     return r;
1553 }
1554 
1555 float16 int64_to_float16(int64_t a, float_status *status)
1556 {
1557     FloatParts pa = int_to_float(a, status);
1558     return float16_round_pack_canonical(pa, status);
1559 }
1560 
1561 float16 int32_to_float16(int32_t a, float_status *status)
1562 {
1563     return int64_to_float16(a, status);
1564 }
1565 
1566 float16 int16_to_float16(int16_t a, float_status *status)
1567 {
1568     return int64_to_float16(a, status);
1569 }
1570 
1571 float32 int64_to_float32(int64_t a, float_status *status)
1572 {
1573     FloatParts pa = int_to_float(a, status);
1574     return float32_round_pack_canonical(pa, status);
1575 }
1576 
1577 float32 int32_to_float32(int32_t a, float_status *status)
1578 {
1579     return int64_to_float32(a, status);
1580 }
1581 
1582 float32 int16_to_float32(int16_t a, float_status *status)
1583 {
1584     return int64_to_float32(a, status);
1585 }
1586 
1587 float64 int64_to_float64(int64_t a, float_status *status)
1588 {
1589     FloatParts pa = int_to_float(a, status);
1590     return float64_round_pack_canonical(pa, status);
1591 }
1592 
1593 float64 int32_to_float64(int32_t a, float_status *status)
1594 {
1595     return int64_to_float64(a, status);
1596 }
1597 
1598 float64 int16_to_float64(int16_t a, float_status *status)
1599 {
1600     return int64_to_float64(a, status);
1601 }
1602 
1603 
1604 /*
1605  * Unsigned Integer to float conversions
1606  *
1607  * Returns the result of converting the unsigned integer `a' to the
1608  * floating-point format. The conversion is performed according to the
1609  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1610  */
1611 
1612 static FloatParts uint_to_float(uint64_t a, float_status *status)
1613 {
1614     FloatParts r = { .sign = false};
1615 
1616     if (a == 0) {
1617         r.cls = float_class_zero;
1618     } else {
1619         int spare_bits = clz64(a) - 1;
1620         r.cls = float_class_normal;
1621         r.exp = DECOMPOSED_BINARY_POINT - spare_bits;
1622         if (spare_bits < 0) {
1623             shift64RightJamming(a, -spare_bits, &a);
1624             r.frac = a;
1625         } else {
1626             r.frac = a << spare_bits;
1627         }
1628     }
1629 
1630     return r;
1631 }
1632 
1633 float16 uint64_to_float16(uint64_t a, float_status *status)
1634 {
1635     FloatParts pa = uint_to_float(a, status);
1636     return float16_round_pack_canonical(pa, status);
1637 }
1638 
1639 float16 uint32_to_float16(uint32_t a, float_status *status)
1640 {
1641     return uint64_to_float16(a, status);
1642 }
1643 
1644 float16 uint16_to_float16(uint16_t a, float_status *status)
1645 {
1646     return uint64_to_float16(a, status);
1647 }
1648 
1649 float32 uint64_to_float32(uint64_t a, float_status *status)
1650 {
1651     FloatParts pa = uint_to_float(a, status);
1652     return float32_round_pack_canonical(pa, status);
1653 }
1654 
1655 float32 uint32_to_float32(uint32_t a, float_status *status)
1656 {
1657     return uint64_to_float32(a, status);
1658 }
1659 
1660 float32 uint16_to_float32(uint16_t a, float_status *status)
1661 {
1662     return uint64_to_float32(a, status);
1663 }
1664 
1665 float64 uint64_to_float64(uint64_t a, float_status *status)
1666 {
1667     FloatParts pa = uint_to_float(a, status);
1668     return float64_round_pack_canonical(pa, status);
1669 }
1670 
1671 float64 uint32_to_float64(uint32_t a, float_status *status)
1672 {
1673     return uint64_to_float64(a, status);
1674 }
1675 
1676 float64 uint16_to_float64(uint16_t a, float_status *status)
1677 {
1678     return uint64_to_float64(a, status);
1679 }
1680 
1681 /* Float Min/Max */
1682 /* min() and max() functions. These can't be implemented as
1683  * 'compare and pick one input' because that would mishandle
1684  * NaNs and +0 vs -0.
1685  *
1686  * minnum() and maxnum() functions. These are similar to the min()
1687  * and max() functions but if one of the arguments is a QNaN and
1688  * the other is numerical then the numerical argument is returned.
1689  * SNaNs will get quietened before being returned.
1690  * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
1691  * and maxNum() operations. min() and max() are the typical min/max
1692  * semantics provided by many CPUs which predate that specification.
1693  *
1694  * minnummag() and maxnummag() functions correspond to minNumMag()
1695  * and minNumMag() from the IEEE-754 2008.
1696  */
1697 static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin,
1698                                 bool ieee, bool ismag, float_status *s)
1699 {
1700     if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
1701         if (ieee) {
1702             /* Takes two floating-point values `a' and `b', one of
1703              * which is a NaN, and returns the appropriate NaN
1704              * result. If either `a' or `b' is a signaling NaN,
1705              * the invalid exception is raised.
1706              */
1707             if (is_snan(a.cls) || is_snan(b.cls)) {
1708                 return pick_nan(a, b, s);
1709             } else if (is_nan(a.cls) && !is_nan(b.cls)) {
1710                 return b;
1711             } else if (is_nan(b.cls) && !is_nan(a.cls)) {
1712                 return a;
1713             }
1714         }
1715         return pick_nan(a, b, s);
1716     } else {
1717         int a_exp, b_exp;
1718 
1719         switch (a.cls) {
1720         case float_class_normal:
1721             a_exp = a.exp;
1722             break;
1723         case float_class_inf:
1724             a_exp = INT_MAX;
1725             break;
1726         case float_class_zero:
1727             a_exp = INT_MIN;
1728             break;
1729         default:
1730             g_assert_not_reached();
1731             break;
1732         }
1733         switch (b.cls) {
1734         case float_class_normal:
1735             b_exp = b.exp;
1736             break;
1737         case float_class_inf:
1738             b_exp = INT_MAX;
1739             break;
1740         case float_class_zero:
1741             b_exp = INT_MIN;
1742             break;
1743         default:
1744             g_assert_not_reached();
1745             break;
1746         }
1747 
1748         if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
1749             bool a_less = a_exp < b_exp;
1750             if (a_exp == b_exp) {
1751                 a_less = a.frac < b.frac;
1752             }
1753             return a_less ^ ismin ? b : a;
1754         }
1755 
1756         if (a.sign == b.sign) {
1757             bool a_less = a_exp < b_exp;
1758             if (a_exp == b_exp) {
1759                 a_less = a.frac < b.frac;
1760             }
1761             return a.sign ^ a_less ^ ismin ? b : a;
1762         } else {
1763             return a.sign ^ ismin ? b : a;
1764         }
1765     }
1766 }
1767 
1768 #define MINMAX(sz, name, ismin, isiee, ismag)                           \
1769 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b,      \
1770                                      float_status *s)                   \
1771 {                                                                       \
1772     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
1773     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
1774     FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);      \
1775                                                                         \
1776     return float ## sz ## _round_pack_canonical(pr, s);                 \
1777 }
1778 
1779 MINMAX(16, min, true, false, false)
1780 MINMAX(16, minnum, true, true, false)
1781 MINMAX(16, minnummag, true, true, true)
1782 MINMAX(16, max, false, false, false)
1783 MINMAX(16, maxnum, false, true, false)
1784 MINMAX(16, maxnummag, false, true, true)
1785 
1786 MINMAX(32, min, true, false, false)
1787 MINMAX(32, minnum, true, true, false)
1788 MINMAX(32, minnummag, true, true, true)
1789 MINMAX(32, max, false, false, false)
1790 MINMAX(32, maxnum, false, true, false)
1791 MINMAX(32, maxnummag, false, true, true)
1792 
1793 MINMAX(64, min, true, false, false)
1794 MINMAX(64, minnum, true, true, false)
1795 MINMAX(64, minnummag, true, true, true)
1796 MINMAX(64, max, false, false, false)
1797 MINMAX(64, maxnum, false, true, false)
1798 MINMAX(64, maxnummag, false, true, true)
1799 
1800 #undef MINMAX
1801 
1802 /* Floating point compare */
1803 static int compare_floats(FloatParts a, FloatParts b, bool is_quiet,
1804                           float_status *s)
1805 {
1806     if (is_nan(a.cls) || is_nan(b.cls)) {
1807         if (!is_quiet ||
1808             a.cls == float_class_snan ||
1809             b.cls == float_class_snan) {
1810             s->float_exception_flags |= float_flag_invalid;
1811         }
1812         return float_relation_unordered;
1813     }
1814 
1815     if (a.cls == float_class_zero) {
1816         if (b.cls == float_class_zero) {
1817             return float_relation_equal;
1818         }
1819         return b.sign ? float_relation_greater : float_relation_less;
1820     } else if (b.cls == float_class_zero) {
1821         return a.sign ? float_relation_less : float_relation_greater;
1822     }
1823 
1824     /* The only really important thing about infinity is its sign. If
1825      * both are infinities the sign marks the smallest of the two.
1826      */
1827     if (a.cls == float_class_inf) {
1828         if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
1829             return float_relation_equal;
1830         }
1831         return a.sign ? float_relation_less : float_relation_greater;
1832     } else if (b.cls == float_class_inf) {
1833         return b.sign ? float_relation_greater : float_relation_less;
1834     }
1835 
1836     if (a.sign != b.sign) {
1837         return a.sign ? float_relation_less : float_relation_greater;
1838     }
1839 
1840     if (a.exp == b.exp) {
1841         if (a.frac == b.frac) {
1842             return float_relation_equal;
1843         }
1844         if (a.sign) {
1845             return a.frac > b.frac ?
1846                 float_relation_less : float_relation_greater;
1847         } else {
1848             return a.frac > b.frac ?
1849                 float_relation_greater : float_relation_less;
1850         }
1851     } else {
1852         if (a.sign) {
1853             return a.exp > b.exp ? float_relation_less : float_relation_greater;
1854         } else {
1855             return a.exp > b.exp ? float_relation_greater : float_relation_less;
1856         }
1857     }
1858 }
1859 
1860 #define COMPARE(sz)                                                     \
1861 int float ## sz ## _compare(float ## sz a, float ## sz b,               \
1862                             float_status *s)                            \
1863 {                                                                       \
1864     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
1865     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
1866     return compare_floats(pa, pb, false, s);                            \
1867 }                                                                       \
1868 int float ## sz ## _compare_quiet(float ## sz a, float ## sz b,         \
1869                                   float_status *s)                      \
1870 {                                                                       \
1871     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
1872     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
1873     return compare_floats(pa, pb, true, s);                             \
1874 }
1875 
1876 COMPARE(16)
1877 COMPARE(32)
1878 COMPARE(64)
1879 
1880 #undef COMPARE
1881 
1882 /* Multiply A by 2 raised to the power N.  */
1883 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s)
1884 {
1885     if (unlikely(is_nan(a.cls))) {
1886         return return_nan(a, s);
1887     }
1888     if (a.cls == float_class_normal) {
1889         /* The largest float type (even though not supported by FloatParts)
1890          * is float128, which has a 15 bit exponent.  Bounding N to 16 bits
1891          * still allows rounding to infinity, without allowing overflow
1892          * within the int32_t that backs FloatParts.exp.
1893          */
1894         n = MIN(MAX(n, -0x10000), 0x10000);
1895         a.exp += n;
1896     }
1897     return a;
1898 }
1899 
1900 float16 float16_scalbn(float16 a, int n, float_status *status)
1901 {
1902     FloatParts pa = float16_unpack_canonical(a, status);
1903     FloatParts pr = scalbn_decomposed(pa, n, status);
1904     return float16_round_pack_canonical(pr, status);
1905 }
1906 
1907 float32 float32_scalbn(float32 a, int n, float_status *status)
1908 {
1909     FloatParts pa = float32_unpack_canonical(a, status);
1910     FloatParts pr = scalbn_decomposed(pa, n, status);
1911     return float32_round_pack_canonical(pr, status);
1912 }
1913 
1914 float64 float64_scalbn(float64 a, int n, float_status *status)
1915 {
1916     FloatParts pa = float64_unpack_canonical(a, status);
1917     FloatParts pr = scalbn_decomposed(pa, n, status);
1918     return float64_round_pack_canonical(pr, status);
1919 }
1920 
1921 /*
1922  * Square Root
1923  *
1924  * The old softfloat code did an approximation step before zeroing in
1925  * on the final result. However for simpleness we just compute the
1926  * square root by iterating down from the implicit bit to enough extra
1927  * bits to ensure we get a correctly rounded result.
1928  *
1929  * This does mean however the calculation is slower than before,
1930  * especially for 64 bit floats.
1931  */
1932 
1933 static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p)
1934 {
1935     uint64_t a_frac, r_frac, s_frac;
1936     int bit, last_bit;
1937 
1938     if (is_nan(a.cls)) {
1939         return return_nan(a, s);
1940     }
1941     if (a.cls == float_class_zero) {
1942         return a;  /* sqrt(+-0) = +-0 */
1943     }
1944     if (a.sign) {
1945         s->float_exception_flags |= float_flag_invalid;
1946         a.cls = float_class_dnan;
1947         return a;
1948     }
1949     if (a.cls == float_class_inf) {
1950         return a;  /* sqrt(+inf) = +inf */
1951     }
1952 
1953     assert(a.cls == float_class_normal);
1954 
1955     /* We need two overflow bits at the top. Adding room for that is a
1956      * right shift. If the exponent is odd, we can discard the low bit
1957      * by multiplying the fraction by 2; that's a left shift. Combine
1958      * those and we shift right if the exponent is even.
1959      */
1960     a_frac = a.frac;
1961     if (!(a.exp & 1)) {
1962         a_frac >>= 1;
1963     }
1964     a.exp >>= 1;
1965 
1966     /* Bit-by-bit computation of sqrt.  */
1967     r_frac = 0;
1968     s_frac = 0;
1969 
1970     /* Iterate from implicit bit down to the 3 extra bits to compute a
1971      * properly rounded result. Remember we've inserted one more bit
1972      * at the top, so these positions are one less.
1973      */
1974     bit = DECOMPOSED_BINARY_POINT - 1;
1975     last_bit = MAX(p->frac_shift - 4, 0);
1976     do {
1977         uint64_t q = 1ULL << bit;
1978         uint64_t t_frac = s_frac + q;
1979         if (t_frac <= a_frac) {
1980             s_frac = t_frac + q;
1981             a_frac -= t_frac;
1982             r_frac += q;
1983         }
1984         a_frac <<= 1;
1985     } while (--bit >= last_bit);
1986 
1987     /* Undo the right shift done above. If there is any remaining
1988      * fraction, the result is inexact. Set the sticky bit.
1989      */
1990     a.frac = (r_frac << 1) + (a_frac != 0);
1991 
1992     return a;
1993 }
1994 
1995 float16 __attribute__((flatten)) float16_sqrt(float16 a, float_status *status)
1996 {
1997     FloatParts pa = float16_unpack_canonical(a, status);
1998     FloatParts pr = sqrt_float(pa, status, &float16_params);
1999     return float16_round_pack_canonical(pr, status);
2000 }
2001 
2002 float32 __attribute__((flatten)) float32_sqrt(float32 a, float_status *status)
2003 {
2004     FloatParts pa = float32_unpack_canonical(a, status);
2005     FloatParts pr = sqrt_float(pa, status, &float32_params);
2006     return float32_round_pack_canonical(pr, status);
2007 }
2008 
2009 float64 __attribute__((flatten)) float64_sqrt(float64 a, float_status *status)
2010 {
2011     FloatParts pa = float64_unpack_canonical(a, status);
2012     FloatParts pr = sqrt_float(pa, status, &float64_params);
2013     return float64_round_pack_canonical(pr, status);
2014 }
2015 
2016 
2017 /*----------------------------------------------------------------------------
2018 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
2019 | and 7, and returns the properly rounded 32-bit integer corresponding to the
2020 | input.  If `zSign' is 1, the input is negated before being converted to an
2021 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
2022 | is simply rounded to an integer, with the inexact exception raised if the
2023 | input cannot be represented exactly as an integer.  However, if the fixed-
2024 | point input is too large, the invalid exception is raised and the largest
2025 | positive or negative integer is returned.
2026 *----------------------------------------------------------------------------*/
2027 
2028 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
2029 {
2030     int8_t roundingMode;
2031     flag roundNearestEven;
2032     int8_t roundIncrement, roundBits;
2033     int32_t z;
2034 
2035     roundingMode = status->float_rounding_mode;
2036     roundNearestEven = ( roundingMode == float_round_nearest_even );
2037     switch (roundingMode) {
2038     case float_round_nearest_even:
2039     case float_round_ties_away:
2040         roundIncrement = 0x40;
2041         break;
2042     case float_round_to_zero:
2043         roundIncrement = 0;
2044         break;
2045     case float_round_up:
2046         roundIncrement = zSign ? 0 : 0x7f;
2047         break;
2048     case float_round_down:
2049         roundIncrement = zSign ? 0x7f : 0;
2050         break;
2051     default:
2052         abort();
2053     }
2054     roundBits = absZ & 0x7F;
2055     absZ = ( absZ + roundIncrement )>>7;
2056     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
2057     z = absZ;
2058     if ( zSign ) z = - z;
2059     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
2060         float_raise(float_flag_invalid, status);
2061         return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
2062     }
2063     if (roundBits) {
2064         status->float_exception_flags |= float_flag_inexact;
2065     }
2066     return z;
2067 
2068 }
2069 
2070 /*----------------------------------------------------------------------------
2071 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2072 | `absZ1', with binary point between bits 63 and 64 (between the input words),
2073 | and returns the properly rounded 64-bit integer corresponding to the input.
2074 | If `zSign' is 1, the input is negated before being converted to an integer.
2075 | Ordinarily, the fixed-point input is simply rounded to an integer, with
2076 | the inexact exception raised if the input cannot be represented exactly as
2077 | an integer.  However, if the fixed-point input is too large, the invalid
2078 | exception is raised and the largest positive or negative integer is
2079 | returned.
2080 *----------------------------------------------------------------------------*/
2081 
2082 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
2083                                float_status *status)
2084 {
2085     int8_t roundingMode;
2086     flag roundNearestEven, increment;
2087     int64_t z;
2088 
2089     roundingMode = status->float_rounding_mode;
2090     roundNearestEven = ( roundingMode == float_round_nearest_even );
2091     switch (roundingMode) {
2092     case float_round_nearest_even:
2093     case float_round_ties_away:
2094         increment = ((int64_t) absZ1 < 0);
2095         break;
2096     case float_round_to_zero:
2097         increment = 0;
2098         break;
2099     case float_round_up:
2100         increment = !zSign && absZ1;
2101         break;
2102     case float_round_down:
2103         increment = zSign && absZ1;
2104         break;
2105     default:
2106         abort();
2107     }
2108     if ( increment ) {
2109         ++absZ0;
2110         if ( absZ0 == 0 ) goto overflow;
2111         absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
2112     }
2113     z = absZ0;
2114     if ( zSign ) z = - z;
2115     if ( z && ( ( z < 0 ) ^ zSign ) ) {
2116  overflow:
2117         float_raise(float_flag_invalid, status);
2118         return
2119               zSign ? (int64_t) LIT64( 0x8000000000000000 )
2120             : LIT64( 0x7FFFFFFFFFFFFFFF );
2121     }
2122     if (absZ1) {
2123         status->float_exception_flags |= float_flag_inexact;
2124     }
2125     return z;
2126 
2127 }
2128 
2129 /*----------------------------------------------------------------------------
2130 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2131 | `absZ1', with binary point between bits 63 and 64 (between the input words),
2132 | and returns the properly rounded 64-bit unsigned integer corresponding to the
2133 | input.  Ordinarily, the fixed-point input is simply rounded to an integer,
2134 | with the inexact exception raised if the input cannot be represented exactly
2135 | as an integer.  However, if the fixed-point input is too large, the invalid
2136 | exception is raised and the largest unsigned integer is returned.
2137 *----------------------------------------------------------------------------*/
2138 
2139 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
2140                                 uint64_t absZ1, float_status *status)
2141 {
2142     int8_t roundingMode;
2143     flag roundNearestEven, increment;
2144 
2145     roundingMode = status->float_rounding_mode;
2146     roundNearestEven = (roundingMode == float_round_nearest_even);
2147     switch (roundingMode) {
2148     case float_round_nearest_even:
2149     case float_round_ties_away:
2150         increment = ((int64_t)absZ1 < 0);
2151         break;
2152     case float_round_to_zero:
2153         increment = 0;
2154         break;
2155     case float_round_up:
2156         increment = !zSign && absZ1;
2157         break;
2158     case float_round_down:
2159         increment = zSign && absZ1;
2160         break;
2161     default:
2162         abort();
2163     }
2164     if (increment) {
2165         ++absZ0;
2166         if (absZ0 == 0) {
2167             float_raise(float_flag_invalid, status);
2168             return LIT64(0xFFFFFFFFFFFFFFFF);
2169         }
2170         absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
2171     }
2172 
2173     if (zSign && absZ0) {
2174         float_raise(float_flag_invalid, status);
2175         return 0;
2176     }
2177 
2178     if (absZ1) {
2179         status->float_exception_flags |= float_flag_inexact;
2180     }
2181     return absZ0;
2182 }
2183 
2184 /*----------------------------------------------------------------------------
2185 | If `a' is denormal and we are in flush-to-zero mode then set the
2186 | input-denormal exception and return zero. Otherwise just return the value.
2187 *----------------------------------------------------------------------------*/
2188 float32 float32_squash_input_denormal(float32 a, float_status *status)
2189 {
2190     if (status->flush_inputs_to_zero) {
2191         if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
2192             float_raise(float_flag_input_denormal, status);
2193             return make_float32(float32_val(a) & 0x80000000);
2194         }
2195     }
2196     return a;
2197 }
2198 
2199 /*----------------------------------------------------------------------------
2200 | Normalizes the subnormal single-precision floating-point value represented
2201 | by the denormalized significand `aSig'.  The normalized exponent and
2202 | significand are stored at the locations pointed to by `zExpPtr' and
2203 | `zSigPtr', respectively.
2204 *----------------------------------------------------------------------------*/
2205 
2206 static void
2207  normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
2208 {
2209     int8_t shiftCount;
2210 
2211     shiftCount = countLeadingZeros32( aSig ) - 8;
2212     *zSigPtr = aSig<<shiftCount;
2213     *zExpPtr = 1 - shiftCount;
2214 
2215 }
2216 
2217 /*----------------------------------------------------------------------------
2218 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2219 | and significand `zSig', and returns the proper single-precision floating-
2220 | point value corresponding to the abstract input.  Ordinarily, the abstract
2221 | value is simply rounded and packed into the single-precision format, with
2222 | the inexact exception raised if the abstract input cannot be represented
2223 | exactly.  However, if the abstract value is too large, the overflow and
2224 | inexact exceptions are raised and an infinity or maximal finite value is
2225 | returned.  If the abstract value is too small, the input value is rounded to
2226 | a subnormal number, and the underflow and inexact exceptions are raised if
2227 | the abstract input cannot be represented exactly as a subnormal single-
2228 | precision floating-point number.
2229 |     The input significand `zSig' has its binary point between bits 30
2230 | and 29, which is 7 bits to the left of the usual location.  This shifted
2231 | significand must be normalized or smaller.  If `zSig' is not normalized,
2232 | `zExp' must be 0; in that case, the result returned is a subnormal number,
2233 | and it must not require rounding.  In the usual case that `zSig' is
2234 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2235 | The handling of underflow and overflow follows the IEC/IEEE Standard for
2236 | Binary Floating-Point Arithmetic.
2237 *----------------------------------------------------------------------------*/
2238 
2239 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
2240                                    float_status *status)
2241 {
2242     int8_t roundingMode;
2243     flag roundNearestEven;
2244     int8_t roundIncrement, roundBits;
2245     flag isTiny;
2246 
2247     roundingMode = status->float_rounding_mode;
2248     roundNearestEven = ( roundingMode == float_round_nearest_even );
2249     switch (roundingMode) {
2250     case float_round_nearest_even:
2251     case float_round_ties_away:
2252         roundIncrement = 0x40;
2253         break;
2254     case float_round_to_zero:
2255         roundIncrement = 0;
2256         break;
2257     case float_round_up:
2258         roundIncrement = zSign ? 0 : 0x7f;
2259         break;
2260     case float_round_down:
2261         roundIncrement = zSign ? 0x7f : 0;
2262         break;
2263     default:
2264         abort();
2265         break;
2266     }
2267     roundBits = zSig & 0x7F;
2268     if ( 0xFD <= (uint16_t) zExp ) {
2269         if (    ( 0xFD < zExp )
2270              || (    ( zExp == 0xFD )
2271                   && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
2272            ) {
2273             float_raise(float_flag_overflow | float_flag_inexact, status);
2274             return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
2275         }
2276         if ( zExp < 0 ) {
2277             if (status->flush_to_zero) {
2278                 float_raise(float_flag_output_denormal, status);
2279                 return packFloat32(zSign, 0, 0);
2280             }
2281             isTiny =
2282                 (status->float_detect_tininess
2283                  == float_tininess_before_rounding)
2284                 || ( zExp < -1 )
2285                 || ( zSig + roundIncrement < 0x80000000 );
2286             shift32RightJamming( zSig, - zExp, &zSig );
2287             zExp = 0;
2288             roundBits = zSig & 0x7F;
2289             if (isTiny && roundBits) {
2290                 float_raise(float_flag_underflow, status);
2291             }
2292         }
2293     }
2294     if (roundBits) {
2295         status->float_exception_flags |= float_flag_inexact;
2296     }
2297     zSig = ( zSig + roundIncrement )>>7;
2298     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
2299     if ( zSig == 0 ) zExp = 0;
2300     return packFloat32( zSign, zExp, zSig );
2301 
2302 }
2303 
2304 /*----------------------------------------------------------------------------
2305 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2306 | and significand `zSig', and returns the proper single-precision floating-
2307 | point value corresponding to the abstract input.  This routine is just like
2308 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
2309 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2310 | floating-point exponent.
2311 *----------------------------------------------------------------------------*/
2312 
2313 static float32
2314  normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
2315                               float_status *status)
2316 {
2317     int8_t shiftCount;
2318 
2319     shiftCount = countLeadingZeros32( zSig ) - 1;
2320     return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
2321                                status);
2322 
2323 }
2324 
2325 /*----------------------------------------------------------------------------
2326 | If `a' is denormal and we are in flush-to-zero mode then set the
2327 | input-denormal exception and return zero. Otherwise just return the value.
2328 *----------------------------------------------------------------------------*/
2329 float64 float64_squash_input_denormal(float64 a, float_status *status)
2330 {
2331     if (status->flush_inputs_to_zero) {
2332         if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
2333             float_raise(float_flag_input_denormal, status);
2334             return make_float64(float64_val(a) & (1ULL << 63));
2335         }
2336     }
2337     return a;
2338 }
2339 
2340 /*----------------------------------------------------------------------------
2341 | Normalizes the subnormal double-precision floating-point value represented
2342 | by the denormalized significand `aSig'.  The normalized exponent and
2343 | significand are stored at the locations pointed to by `zExpPtr' and
2344 | `zSigPtr', respectively.
2345 *----------------------------------------------------------------------------*/
2346 
2347 static void
2348  normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
2349 {
2350     int8_t shiftCount;
2351 
2352     shiftCount = countLeadingZeros64( aSig ) - 11;
2353     *zSigPtr = aSig<<shiftCount;
2354     *zExpPtr = 1 - shiftCount;
2355 
2356 }
2357 
2358 /*----------------------------------------------------------------------------
2359 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2360 | double-precision floating-point value, returning the result.  After being
2361 | shifted into the proper positions, the three fields are simply added
2362 | together to form the result.  This means that any integer portion of `zSig'
2363 | will be added into the exponent.  Since a properly normalized significand
2364 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2365 | than the desired result exponent whenever `zSig' is a complete, normalized
2366 | significand.
2367 *----------------------------------------------------------------------------*/
2368 
2369 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig)
2370 {
2371 
2372     return make_float64(
2373         ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
2374 
2375 }
2376 
2377 /*----------------------------------------------------------------------------
2378 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2379 | and significand `zSig', and returns the proper double-precision floating-
2380 | point value corresponding to the abstract input.  Ordinarily, the abstract
2381 | value is simply rounded and packed into the double-precision format, with
2382 | the inexact exception raised if the abstract input cannot be represented
2383 | exactly.  However, if the abstract value is too large, the overflow and
2384 | inexact exceptions are raised and an infinity or maximal finite value is
2385 | returned.  If the abstract value is too small, the input value is rounded to
2386 | a subnormal number, and the underflow and inexact exceptions are raised if
2387 | the abstract input cannot be represented exactly as a subnormal double-
2388 | precision floating-point number.
2389 |     The input significand `zSig' has its binary point between bits 62
2390 | and 61, which is 10 bits to the left of the usual location.  This shifted
2391 | significand must be normalized or smaller.  If `zSig' is not normalized,
2392 | `zExp' must be 0; in that case, the result returned is a subnormal number,
2393 | and it must not require rounding.  In the usual case that `zSig' is
2394 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2395 | The handling of underflow and overflow follows the IEC/IEEE Standard for
2396 | Binary Floating-Point Arithmetic.
2397 *----------------------------------------------------------------------------*/
2398 
2399 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
2400                                    float_status *status)
2401 {
2402     int8_t roundingMode;
2403     flag roundNearestEven;
2404     int roundIncrement, roundBits;
2405     flag isTiny;
2406 
2407     roundingMode = status->float_rounding_mode;
2408     roundNearestEven = ( roundingMode == float_round_nearest_even );
2409     switch (roundingMode) {
2410     case float_round_nearest_even:
2411     case float_round_ties_away:
2412         roundIncrement = 0x200;
2413         break;
2414     case float_round_to_zero:
2415         roundIncrement = 0;
2416         break;
2417     case float_round_up:
2418         roundIncrement = zSign ? 0 : 0x3ff;
2419         break;
2420     case float_round_down:
2421         roundIncrement = zSign ? 0x3ff : 0;
2422         break;
2423     case float_round_to_odd:
2424         roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
2425         break;
2426     default:
2427         abort();
2428     }
2429     roundBits = zSig & 0x3FF;
2430     if ( 0x7FD <= (uint16_t) zExp ) {
2431         if (    ( 0x7FD < zExp )
2432              || (    ( zExp == 0x7FD )
2433                   && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
2434            ) {
2435             bool overflow_to_inf = roundingMode != float_round_to_odd &&
2436                                    roundIncrement != 0;
2437             float_raise(float_flag_overflow | float_flag_inexact, status);
2438             return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
2439         }
2440         if ( zExp < 0 ) {
2441             if (status->flush_to_zero) {
2442                 float_raise(float_flag_output_denormal, status);
2443                 return packFloat64(zSign, 0, 0);
2444             }
2445             isTiny =
2446                    (status->float_detect_tininess
2447                     == float_tininess_before_rounding)
2448                 || ( zExp < -1 )
2449                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
2450             shift64RightJamming( zSig, - zExp, &zSig );
2451             zExp = 0;
2452             roundBits = zSig & 0x3FF;
2453             if (isTiny && roundBits) {
2454                 float_raise(float_flag_underflow, status);
2455             }
2456             if (roundingMode == float_round_to_odd) {
2457                 /*
2458                  * For round-to-odd case, the roundIncrement depends on
2459                  * zSig which just changed.
2460                  */
2461                 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
2462             }
2463         }
2464     }
2465     if (roundBits) {
2466         status->float_exception_flags |= float_flag_inexact;
2467     }
2468     zSig = ( zSig + roundIncrement )>>10;
2469     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
2470     if ( zSig == 0 ) zExp = 0;
2471     return packFloat64( zSign, zExp, zSig );
2472 
2473 }
2474 
2475 /*----------------------------------------------------------------------------
2476 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2477 | and significand `zSig', and returns the proper double-precision floating-
2478 | point value corresponding to the abstract input.  This routine is just like
2479 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
2480 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2481 | floating-point exponent.
2482 *----------------------------------------------------------------------------*/
2483 
2484 static float64
2485  normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
2486                               float_status *status)
2487 {
2488     int8_t shiftCount;
2489 
2490     shiftCount = countLeadingZeros64( zSig ) - 1;
2491     return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
2492                                status);
2493 
2494 }
2495 
2496 /*----------------------------------------------------------------------------
2497 | Normalizes the subnormal extended double-precision floating-point value
2498 | represented by the denormalized significand `aSig'.  The normalized exponent
2499 | and significand are stored at the locations pointed to by `zExpPtr' and
2500 | `zSigPtr', respectively.
2501 *----------------------------------------------------------------------------*/
2502 
2503 void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
2504                                 uint64_t *zSigPtr)
2505 {
2506     int8_t shiftCount;
2507 
2508     shiftCount = countLeadingZeros64( aSig );
2509     *zSigPtr = aSig<<shiftCount;
2510     *zExpPtr = 1 - shiftCount;
2511 }
2512 
2513 /*----------------------------------------------------------------------------
2514 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2515 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
2516 | and returns the proper extended double-precision floating-point value
2517 | corresponding to the abstract input.  Ordinarily, the abstract value is
2518 | rounded and packed into the extended double-precision format, with the
2519 | inexact exception raised if the abstract input cannot be represented
2520 | exactly.  However, if the abstract value is too large, the overflow and
2521 | inexact exceptions are raised and an infinity or maximal finite value is
2522 | returned.  If the abstract value is too small, the input value is rounded to
2523 | a subnormal number, and the underflow and inexact exceptions are raised if
2524 | the abstract input cannot be represented exactly as a subnormal extended
2525 | double-precision floating-point number.
2526 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
2527 | number of bits as single or double precision, respectively.  Otherwise, the
2528 | result is rounded to the full precision of the extended double-precision
2529 | format.
2530 |     The input significand must be normalized or smaller.  If the input
2531 | significand is not normalized, `zExp' must be 0; in that case, the result
2532 | returned is a subnormal number, and it must not require rounding.  The
2533 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
2534 | Floating-Point Arithmetic.
2535 *----------------------------------------------------------------------------*/
2536 
2537 floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
2538                               int32_t zExp, uint64_t zSig0, uint64_t zSig1,
2539                               float_status *status)
2540 {
2541     int8_t roundingMode;
2542     flag roundNearestEven, increment, isTiny;
2543     int64_t roundIncrement, roundMask, roundBits;
2544 
2545     roundingMode = status->float_rounding_mode;
2546     roundNearestEven = ( roundingMode == float_round_nearest_even );
2547     if ( roundingPrecision == 80 ) goto precision80;
2548     if ( roundingPrecision == 64 ) {
2549         roundIncrement = LIT64( 0x0000000000000400 );
2550         roundMask = LIT64( 0x00000000000007FF );
2551     }
2552     else if ( roundingPrecision == 32 ) {
2553         roundIncrement = LIT64( 0x0000008000000000 );
2554         roundMask = LIT64( 0x000000FFFFFFFFFF );
2555     }
2556     else {
2557         goto precision80;
2558     }
2559     zSig0 |= ( zSig1 != 0 );
2560     switch (roundingMode) {
2561     case float_round_nearest_even:
2562     case float_round_ties_away:
2563         break;
2564     case float_round_to_zero:
2565         roundIncrement = 0;
2566         break;
2567     case float_round_up:
2568         roundIncrement = zSign ? 0 : roundMask;
2569         break;
2570     case float_round_down:
2571         roundIncrement = zSign ? roundMask : 0;
2572         break;
2573     default:
2574         abort();
2575     }
2576     roundBits = zSig0 & roundMask;
2577     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
2578         if (    ( 0x7FFE < zExp )
2579              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
2580            ) {
2581             goto overflow;
2582         }
2583         if ( zExp <= 0 ) {
2584             if (status->flush_to_zero) {
2585                 float_raise(float_flag_output_denormal, status);
2586                 return packFloatx80(zSign, 0, 0);
2587             }
2588             isTiny =
2589                    (status->float_detect_tininess
2590                     == float_tininess_before_rounding)
2591                 || ( zExp < 0 )
2592                 || ( zSig0 <= zSig0 + roundIncrement );
2593             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
2594             zExp = 0;
2595             roundBits = zSig0 & roundMask;
2596             if (isTiny && roundBits) {
2597                 float_raise(float_flag_underflow, status);
2598             }
2599             if (roundBits) {
2600                 status->float_exception_flags |= float_flag_inexact;
2601             }
2602             zSig0 += roundIncrement;
2603             if ( (int64_t) zSig0 < 0 ) zExp = 1;
2604             roundIncrement = roundMask + 1;
2605             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
2606                 roundMask |= roundIncrement;
2607             }
2608             zSig0 &= ~ roundMask;
2609             return packFloatx80( zSign, zExp, zSig0 );
2610         }
2611     }
2612     if (roundBits) {
2613         status->float_exception_flags |= float_flag_inexact;
2614     }
2615     zSig0 += roundIncrement;
2616     if ( zSig0 < roundIncrement ) {
2617         ++zExp;
2618         zSig0 = LIT64( 0x8000000000000000 );
2619     }
2620     roundIncrement = roundMask + 1;
2621     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
2622         roundMask |= roundIncrement;
2623     }
2624     zSig0 &= ~ roundMask;
2625     if ( zSig0 == 0 ) zExp = 0;
2626     return packFloatx80( zSign, zExp, zSig0 );
2627  precision80:
2628     switch (roundingMode) {
2629     case float_round_nearest_even:
2630     case float_round_ties_away:
2631         increment = ((int64_t)zSig1 < 0);
2632         break;
2633     case float_round_to_zero:
2634         increment = 0;
2635         break;
2636     case float_round_up:
2637         increment = !zSign && zSig1;
2638         break;
2639     case float_round_down:
2640         increment = zSign && zSig1;
2641         break;
2642     default:
2643         abort();
2644     }
2645     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
2646         if (    ( 0x7FFE < zExp )
2647              || (    ( zExp == 0x7FFE )
2648                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
2649                   && increment
2650                 )
2651            ) {
2652             roundMask = 0;
2653  overflow:
2654             float_raise(float_flag_overflow | float_flag_inexact, status);
2655             if (    ( roundingMode == float_round_to_zero )
2656                  || ( zSign && ( roundingMode == float_round_up ) )
2657                  || ( ! zSign && ( roundingMode == float_round_down ) )
2658                ) {
2659                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
2660             }
2661             return packFloatx80(zSign,
2662                                 floatx80_infinity_high,
2663                                 floatx80_infinity_low);
2664         }
2665         if ( zExp <= 0 ) {
2666             isTiny =
2667                    (status->float_detect_tininess
2668                     == float_tininess_before_rounding)
2669                 || ( zExp < 0 )
2670                 || ! increment
2671                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
2672             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
2673             zExp = 0;
2674             if (isTiny && zSig1) {
2675                 float_raise(float_flag_underflow, status);
2676             }
2677             if (zSig1) {
2678                 status->float_exception_flags |= float_flag_inexact;
2679             }
2680             switch (roundingMode) {
2681             case float_round_nearest_even:
2682             case float_round_ties_away:
2683                 increment = ((int64_t)zSig1 < 0);
2684                 break;
2685             case float_round_to_zero:
2686                 increment = 0;
2687                 break;
2688             case float_round_up:
2689                 increment = !zSign && zSig1;
2690                 break;
2691             case float_round_down:
2692                 increment = zSign && zSig1;
2693                 break;
2694             default:
2695                 abort();
2696             }
2697             if ( increment ) {
2698                 ++zSig0;
2699                 zSig0 &=
2700                     ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
2701                 if ( (int64_t) zSig0 < 0 ) zExp = 1;
2702             }
2703             return packFloatx80( zSign, zExp, zSig0 );
2704         }
2705     }
2706     if (zSig1) {
2707         status->float_exception_flags |= float_flag_inexact;
2708     }
2709     if ( increment ) {
2710         ++zSig0;
2711         if ( zSig0 == 0 ) {
2712             ++zExp;
2713             zSig0 = LIT64( 0x8000000000000000 );
2714         }
2715         else {
2716             zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
2717         }
2718     }
2719     else {
2720         if ( zSig0 == 0 ) zExp = 0;
2721     }
2722     return packFloatx80( zSign, zExp, zSig0 );
2723 
2724 }
2725 
2726 /*----------------------------------------------------------------------------
2727 | Takes an abstract floating-point value having sign `zSign', exponent
2728 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
2729 | and returns the proper extended double-precision floating-point value
2730 | corresponding to the abstract input.  This routine is just like
2731 | `roundAndPackFloatx80' except that the input significand does not have to be
2732 | normalized.
2733 *----------------------------------------------------------------------------*/
2734 
2735 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
2736                                        flag zSign, int32_t zExp,
2737                                        uint64_t zSig0, uint64_t zSig1,
2738                                        float_status *status)
2739 {
2740     int8_t shiftCount;
2741 
2742     if ( zSig0 == 0 ) {
2743         zSig0 = zSig1;
2744         zSig1 = 0;
2745         zExp -= 64;
2746     }
2747     shiftCount = countLeadingZeros64( zSig0 );
2748     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
2749     zExp -= shiftCount;
2750     return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
2751                                 zSig0, zSig1, status);
2752 
2753 }
2754 
2755 /*----------------------------------------------------------------------------
2756 | Returns the least-significant 64 fraction bits of the quadruple-precision
2757 | floating-point value `a'.
2758 *----------------------------------------------------------------------------*/
2759 
2760 static inline uint64_t extractFloat128Frac1( float128 a )
2761 {
2762 
2763     return a.low;
2764 
2765 }
2766 
2767 /*----------------------------------------------------------------------------
2768 | Returns the most-significant 48 fraction bits of the quadruple-precision
2769 | floating-point value `a'.
2770 *----------------------------------------------------------------------------*/
2771 
2772 static inline uint64_t extractFloat128Frac0( float128 a )
2773 {
2774 
2775     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
2776 
2777 }
2778 
2779 /*----------------------------------------------------------------------------
2780 | Returns the exponent bits of the quadruple-precision floating-point value
2781 | `a'.
2782 *----------------------------------------------------------------------------*/
2783 
2784 static inline int32_t extractFloat128Exp( float128 a )
2785 {
2786 
2787     return ( a.high>>48 ) & 0x7FFF;
2788 
2789 }
2790 
2791 /*----------------------------------------------------------------------------
2792 | Returns the sign bit of the quadruple-precision floating-point value `a'.
2793 *----------------------------------------------------------------------------*/
2794 
2795 static inline flag extractFloat128Sign( float128 a )
2796 {
2797 
2798     return a.high>>63;
2799 
2800 }
2801 
2802 /*----------------------------------------------------------------------------
2803 | Normalizes the subnormal quadruple-precision floating-point value
2804 | represented by the denormalized significand formed by the concatenation of
2805 | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
2806 | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
2807 | significand are stored at the location pointed to by `zSig0Ptr', and the
2808 | least significant 64 bits of the normalized significand are stored at the
2809 | location pointed to by `zSig1Ptr'.
2810 *----------------------------------------------------------------------------*/
2811 
2812 static void
2813  normalizeFloat128Subnormal(
2814      uint64_t aSig0,
2815      uint64_t aSig1,
2816      int32_t *zExpPtr,
2817      uint64_t *zSig0Ptr,
2818      uint64_t *zSig1Ptr
2819  )
2820 {
2821     int8_t shiftCount;
2822 
2823     if ( aSig0 == 0 ) {
2824         shiftCount = countLeadingZeros64( aSig1 ) - 15;
2825         if ( shiftCount < 0 ) {
2826             *zSig0Ptr = aSig1>>( - shiftCount );
2827             *zSig1Ptr = aSig1<<( shiftCount & 63 );
2828         }
2829         else {
2830             *zSig0Ptr = aSig1<<shiftCount;
2831             *zSig1Ptr = 0;
2832         }
2833         *zExpPtr = - shiftCount - 63;
2834     }
2835     else {
2836         shiftCount = countLeadingZeros64( aSig0 ) - 15;
2837         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
2838         *zExpPtr = 1 - shiftCount;
2839     }
2840 
2841 }
2842 
2843 /*----------------------------------------------------------------------------
2844 | Packs the sign `zSign', the exponent `zExp', and the significand formed
2845 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
2846 | floating-point value, returning the result.  After being shifted into the
2847 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
2848 | added together to form the most significant 32 bits of the result.  This
2849 | means that any integer portion of `zSig0' will be added into the exponent.
2850 | Since a properly normalized significand will have an integer portion equal
2851 | to 1, the `zExp' input should be 1 less than the desired result exponent
2852 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
2853 | significand.
2854 *----------------------------------------------------------------------------*/
2855 
2856 static inline float128
2857  packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
2858 {
2859     float128 z;
2860 
2861     z.low = zSig1;
2862     z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
2863     return z;
2864 
2865 }
2866 
2867 /*----------------------------------------------------------------------------
2868 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2869 | and extended significand formed by the concatenation of `zSig0', `zSig1',
2870 | and `zSig2', and returns the proper quadruple-precision floating-point value
2871 | corresponding to the abstract input.  Ordinarily, the abstract value is
2872 | simply rounded and packed into the quadruple-precision format, with the
2873 | inexact exception raised if the abstract input cannot be represented
2874 | exactly.  However, if the abstract value is too large, the overflow and
2875 | inexact exceptions are raised and an infinity or maximal finite value is
2876 | returned.  If the abstract value is too small, the input value is rounded to
2877 | a subnormal number, and the underflow and inexact exceptions are raised if
2878 | the abstract input cannot be represented exactly as a subnormal quadruple-
2879 | precision floating-point number.
2880 |     The input significand must be normalized or smaller.  If the input
2881 | significand is not normalized, `zExp' must be 0; in that case, the result
2882 | returned is a subnormal number, and it must not require rounding.  In the
2883 | usual case that the input significand is normalized, `zExp' must be 1 less
2884 | than the ``true'' floating-point exponent.  The handling of underflow and
2885 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2886 *----------------------------------------------------------------------------*/
2887 
2888 static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
2889                                      uint64_t zSig0, uint64_t zSig1,
2890                                      uint64_t zSig2, float_status *status)
2891 {
2892     int8_t roundingMode;
2893     flag roundNearestEven, increment, isTiny;
2894 
2895     roundingMode = status->float_rounding_mode;
2896     roundNearestEven = ( roundingMode == float_round_nearest_even );
2897     switch (roundingMode) {
2898     case float_round_nearest_even:
2899     case float_round_ties_away:
2900         increment = ((int64_t)zSig2 < 0);
2901         break;
2902     case float_round_to_zero:
2903         increment = 0;
2904         break;
2905     case float_round_up:
2906         increment = !zSign && zSig2;
2907         break;
2908     case float_round_down:
2909         increment = zSign && zSig2;
2910         break;
2911     case float_round_to_odd:
2912         increment = !(zSig1 & 0x1) && zSig2;
2913         break;
2914     default:
2915         abort();
2916     }
2917     if ( 0x7FFD <= (uint32_t) zExp ) {
2918         if (    ( 0x7FFD < zExp )
2919              || (    ( zExp == 0x7FFD )
2920                   && eq128(
2921                          LIT64( 0x0001FFFFFFFFFFFF ),
2922                          LIT64( 0xFFFFFFFFFFFFFFFF ),
2923                          zSig0,
2924                          zSig1
2925                      )
2926                   && increment
2927                 )
2928            ) {
2929             float_raise(float_flag_overflow | float_flag_inexact, status);
2930             if (    ( roundingMode == float_round_to_zero )
2931                  || ( zSign && ( roundingMode == float_round_up ) )
2932                  || ( ! zSign && ( roundingMode == float_round_down ) )
2933                  || (roundingMode == float_round_to_odd)
2934                ) {
2935                 return
2936                     packFloat128(
2937                         zSign,
2938                         0x7FFE,
2939                         LIT64( 0x0000FFFFFFFFFFFF ),
2940                         LIT64( 0xFFFFFFFFFFFFFFFF )
2941                     );
2942             }
2943             return packFloat128( zSign, 0x7FFF, 0, 0 );
2944         }
2945         if ( zExp < 0 ) {
2946             if (status->flush_to_zero) {
2947                 float_raise(float_flag_output_denormal, status);
2948                 return packFloat128(zSign, 0, 0, 0);
2949             }
2950             isTiny =
2951                    (status->float_detect_tininess
2952                     == float_tininess_before_rounding)
2953                 || ( zExp < -1 )
2954                 || ! increment
2955                 || lt128(
2956                        zSig0,
2957                        zSig1,
2958                        LIT64( 0x0001FFFFFFFFFFFF ),
2959                        LIT64( 0xFFFFFFFFFFFFFFFF )
2960                    );
2961             shift128ExtraRightJamming(
2962                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
2963             zExp = 0;
2964             if (isTiny && zSig2) {
2965                 float_raise(float_flag_underflow, status);
2966             }
2967             switch (roundingMode) {
2968             case float_round_nearest_even:
2969             case float_round_ties_away:
2970                 increment = ((int64_t)zSig2 < 0);
2971                 break;
2972             case float_round_to_zero:
2973                 increment = 0;
2974                 break;
2975             case float_round_up:
2976                 increment = !zSign && zSig2;
2977                 break;
2978             case float_round_down:
2979                 increment = zSign && zSig2;
2980                 break;
2981             case float_round_to_odd:
2982                 increment = !(zSig1 & 0x1) && zSig2;
2983                 break;
2984             default:
2985                 abort();
2986             }
2987         }
2988     }
2989     if (zSig2) {
2990         status->float_exception_flags |= float_flag_inexact;
2991     }
2992     if ( increment ) {
2993         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
2994         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
2995     }
2996     else {
2997         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
2998     }
2999     return packFloat128( zSign, zExp, zSig0, zSig1 );
3000 
3001 }
3002 
3003 /*----------------------------------------------------------------------------
3004 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3005 | and significand formed by the concatenation of `zSig0' and `zSig1', and
3006 | returns the proper quadruple-precision floating-point value corresponding
3007 | to the abstract input.  This routine is just like `roundAndPackFloat128'
3008 | except that the input significand has fewer bits and does not have to be
3009 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
3010 | point exponent.
3011 *----------------------------------------------------------------------------*/
3012 
3013 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
3014                                               uint64_t zSig0, uint64_t zSig1,
3015                                               float_status *status)
3016 {
3017     int8_t shiftCount;
3018     uint64_t zSig2;
3019 
3020     if ( zSig0 == 0 ) {
3021         zSig0 = zSig1;
3022         zSig1 = 0;
3023         zExp -= 64;
3024     }
3025     shiftCount = countLeadingZeros64( zSig0 ) - 15;
3026     if ( 0 <= shiftCount ) {
3027         zSig2 = 0;
3028         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3029     }
3030     else {
3031         shift128ExtraRightJamming(
3032             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
3033     }
3034     zExp -= shiftCount;
3035     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
3036 
3037 }
3038 
3039 
3040 /*----------------------------------------------------------------------------
3041 | Returns the result of converting the 32-bit two's complement integer `a'
3042 | to the extended double-precision floating-point format.  The conversion
3043 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3044 | Arithmetic.
3045 *----------------------------------------------------------------------------*/
3046 
3047 floatx80 int32_to_floatx80(int32_t a, float_status *status)
3048 {
3049     flag zSign;
3050     uint32_t absA;
3051     int8_t shiftCount;
3052     uint64_t zSig;
3053 
3054     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
3055     zSign = ( a < 0 );
3056     absA = zSign ? - a : a;
3057     shiftCount = countLeadingZeros32( absA ) + 32;
3058     zSig = absA;
3059     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
3060 
3061 }
3062 
3063 /*----------------------------------------------------------------------------
3064 | Returns the result of converting the 32-bit two's complement integer `a' to
3065 | the quadruple-precision floating-point format.  The conversion is performed
3066 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3067 *----------------------------------------------------------------------------*/
3068 
3069 float128 int32_to_float128(int32_t a, float_status *status)
3070 {
3071     flag zSign;
3072     uint32_t absA;
3073     int8_t shiftCount;
3074     uint64_t zSig0;
3075 
3076     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
3077     zSign = ( a < 0 );
3078     absA = zSign ? - a : a;
3079     shiftCount = countLeadingZeros32( absA ) + 17;
3080     zSig0 = absA;
3081     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
3082 
3083 }
3084 
3085 /*----------------------------------------------------------------------------
3086 | Returns the result of converting the 64-bit two's complement integer `a'
3087 | to the extended double-precision floating-point format.  The conversion
3088 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3089 | Arithmetic.
3090 *----------------------------------------------------------------------------*/
3091 
3092 floatx80 int64_to_floatx80(int64_t a, float_status *status)
3093 {
3094     flag zSign;
3095     uint64_t absA;
3096     int8_t shiftCount;
3097 
3098     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
3099     zSign = ( a < 0 );
3100     absA = zSign ? - a : a;
3101     shiftCount = countLeadingZeros64( absA );
3102     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
3103 
3104 }
3105 
3106 /*----------------------------------------------------------------------------
3107 | Returns the result of converting the 64-bit two's complement integer `a' to
3108 | the quadruple-precision floating-point format.  The conversion is performed
3109 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3110 *----------------------------------------------------------------------------*/
3111 
3112 float128 int64_to_float128(int64_t a, float_status *status)
3113 {
3114     flag zSign;
3115     uint64_t absA;
3116     int8_t shiftCount;
3117     int32_t zExp;
3118     uint64_t zSig0, zSig1;
3119 
3120     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
3121     zSign = ( a < 0 );
3122     absA = zSign ? - a : a;
3123     shiftCount = countLeadingZeros64( absA ) + 49;
3124     zExp = 0x406E - shiftCount;
3125     if ( 64 <= shiftCount ) {
3126         zSig1 = 0;
3127         zSig0 = absA;
3128         shiftCount -= 64;
3129     }
3130     else {
3131         zSig1 = absA;
3132         zSig0 = 0;
3133     }
3134     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3135     return packFloat128( zSign, zExp, zSig0, zSig1 );
3136 
3137 }
3138 
3139 /*----------------------------------------------------------------------------
3140 | Returns the result of converting the 64-bit unsigned integer `a'
3141 | to the quadruple-precision floating-point format.  The conversion is performed
3142 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3143 *----------------------------------------------------------------------------*/
3144 
3145 float128 uint64_to_float128(uint64_t a, float_status *status)
3146 {
3147     if (a == 0) {
3148         return float128_zero;
3149     }
3150     return normalizeRoundAndPackFloat128(0, 0x406E, a, 0, status);
3151 }
3152 
3153 
3154 
3155 
3156 /*----------------------------------------------------------------------------
3157 | Returns the result of converting the single-precision floating-point value
3158 | `a' to the double-precision floating-point format.  The conversion is
3159 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3160 | Arithmetic.
3161 *----------------------------------------------------------------------------*/
3162 
3163 float64 float32_to_float64(float32 a, float_status *status)
3164 {
3165     flag aSign;
3166     int aExp;
3167     uint32_t aSig;
3168     a = float32_squash_input_denormal(a, status);
3169 
3170     aSig = extractFloat32Frac( a );
3171     aExp = extractFloat32Exp( a );
3172     aSign = extractFloat32Sign( a );
3173     if ( aExp == 0xFF ) {
3174         if (aSig) {
3175             return commonNaNToFloat64(float32ToCommonNaN(a, status), status);
3176         }
3177         return packFloat64( aSign, 0x7FF, 0 );
3178     }
3179     if ( aExp == 0 ) {
3180         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
3181         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3182         --aExp;
3183     }
3184     return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
3185 
3186 }
3187 
3188 /*----------------------------------------------------------------------------
3189 | Returns the result of converting the single-precision floating-point value
3190 | `a' to the extended double-precision floating-point format.  The conversion
3191 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3192 | Arithmetic.
3193 *----------------------------------------------------------------------------*/
3194 
3195 floatx80 float32_to_floatx80(float32 a, float_status *status)
3196 {
3197     flag aSign;
3198     int aExp;
3199     uint32_t aSig;
3200 
3201     a = float32_squash_input_denormal(a, status);
3202     aSig = extractFloat32Frac( a );
3203     aExp = extractFloat32Exp( a );
3204     aSign = extractFloat32Sign( a );
3205     if ( aExp == 0xFF ) {
3206         if (aSig) {
3207             return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
3208         }
3209         return packFloatx80(aSign,
3210                             floatx80_infinity_high,
3211                             floatx80_infinity_low);
3212     }
3213     if ( aExp == 0 ) {
3214         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3215         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3216     }
3217     aSig |= 0x00800000;
3218     return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
3219 
3220 }
3221 
3222 /*----------------------------------------------------------------------------
3223 | Returns the result of converting the single-precision floating-point value
3224 | `a' to the double-precision floating-point format.  The conversion is
3225 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3226 | Arithmetic.
3227 *----------------------------------------------------------------------------*/
3228 
3229 float128 float32_to_float128(float32 a, float_status *status)
3230 {
3231     flag aSign;
3232     int aExp;
3233     uint32_t aSig;
3234 
3235     a = float32_squash_input_denormal(a, status);
3236     aSig = extractFloat32Frac( a );
3237     aExp = extractFloat32Exp( a );
3238     aSign = extractFloat32Sign( a );
3239     if ( aExp == 0xFF ) {
3240         if (aSig) {
3241             return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
3242         }
3243         return packFloat128( aSign, 0x7FFF, 0, 0 );
3244     }
3245     if ( aExp == 0 ) {
3246         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3247         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3248         --aExp;
3249     }
3250     return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
3251 
3252 }
3253 
3254 /*----------------------------------------------------------------------------
3255 | Returns the remainder of the single-precision floating-point value `a'
3256 | with respect to the corresponding value `b'.  The operation is performed
3257 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3258 *----------------------------------------------------------------------------*/
3259 
3260 float32 float32_rem(float32 a, float32 b, float_status *status)
3261 {
3262     flag aSign, zSign;
3263     int aExp, bExp, expDiff;
3264     uint32_t aSig, bSig;
3265     uint32_t q;
3266     uint64_t aSig64, bSig64, q64;
3267     uint32_t alternateASig;
3268     int32_t sigMean;
3269     a = float32_squash_input_denormal(a, status);
3270     b = float32_squash_input_denormal(b, status);
3271 
3272     aSig = extractFloat32Frac( a );
3273     aExp = extractFloat32Exp( a );
3274     aSign = extractFloat32Sign( a );
3275     bSig = extractFloat32Frac( b );
3276     bExp = extractFloat32Exp( b );
3277     if ( aExp == 0xFF ) {
3278         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
3279             return propagateFloat32NaN(a, b, status);
3280         }
3281         float_raise(float_flag_invalid, status);
3282         return float32_default_nan(status);
3283     }
3284     if ( bExp == 0xFF ) {
3285         if (bSig) {
3286             return propagateFloat32NaN(a, b, status);
3287         }
3288         return a;
3289     }
3290     if ( bExp == 0 ) {
3291         if ( bSig == 0 ) {
3292             float_raise(float_flag_invalid, status);
3293             return float32_default_nan(status);
3294         }
3295         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
3296     }
3297     if ( aExp == 0 ) {
3298         if ( aSig == 0 ) return a;
3299         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3300     }
3301     expDiff = aExp - bExp;
3302     aSig |= 0x00800000;
3303     bSig |= 0x00800000;
3304     if ( expDiff < 32 ) {
3305         aSig <<= 8;
3306         bSig <<= 8;
3307         if ( expDiff < 0 ) {
3308             if ( expDiff < -1 ) return a;
3309             aSig >>= 1;
3310         }
3311         q = ( bSig <= aSig );
3312         if ( q ) aSig -= bSig;
3313         if ( 0 < expDiff ) {
3314             q = ( ( (uint64_t) aSig )<<32 ) / bSig;
3315             q >>= 32 - expDiff;
3316             bSig >>= 2;
3317             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3318         }
3319         else {
3320             aSig >>= 2;
3321             bSig >>= 2;
3322         }
3323     }
3324     else {
3325         if ( bSig <= aSig ) aSig -= bSig;
3326         aSig64 = ( (uint64_t) aSig )<<40;
3327         bSig64 = ( (uint64_t) bSig )<<40;
3328         expDiff -= 64;
3329         while ( 0 < expDiff ) {
3330             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3331             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3332             aSig64 = - ( ( bSig * q64 )<<38 );
3333             expDiff -= 62;
3334         }
3335         expDiff += 64;
3336         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3337         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3338         q = q64>>( 64 - expDiff );
3339         bSig <<= 6;
3340         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
3341     }
3342     do {
3343         alternateASig = aSig;
3344         ++q;
3345         aSig -= bSig;
3346     } while ( 0 <= (int32_t) aSig );
3347     sigMean = aSig + alternateASig;
3348     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3349         aSig = alternateASig;
3350     }
3351     zSign = ( (int32_t) aSig < 0 );
3352     if ( zSign ) aSig = - aSig;
3353     return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
3354 }
3355 
3356 
3357 
3358 /*----------------------------------------------------------------------------
3359 | Returns the binary exponential of the single-precision floating-point value
3360 | `a'. The operation is performed according to the IEC/IEEE Standard for
3361 | Binary Floating-Point Arithmetic.
3362 |
3363 | Uses the following identities:
3364 |
3365 | 1. -------------------------------------------------------------------------
3366 |      x    x*ln(2)
3367 |     2  = e
3368 |
3369 | 2. -------------------------------------------------------------------------
3370 |                      2     3     4     5           n
3371 |      x        x     x     x     x     x           x
3372 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
3373 |               1!    2!    3!    4!    5!          n!
3374 *----------------------------------------------------------------------------*/
3375 
3376 static const float64 float32_exp2_coefficients[15] =
3377 {
3378     const_float64( 0x3ff0000000000000ll ), /*  1 */
3379     const_float64( 0x3fe0000000000000ll ), /*  2 */
3380     const_float64( 0x3fc5555555555555ll ), /*  3 */
3381     const_float64( 0x3fa5555555555555ll ), /*  4 */
3382     const_float64( 0x3f81111111111111ll ), /*  5 */
3383     const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
3384     const_float64( 0x3f2a01a01a01a01all ), /*  7 */
3385     const_float64( 0x3efa01a01a01a01all ), /*  8 */
3386     const_float64( 0x3ec71de3a556c734ll ), /*  9 */
3387     const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
3388     const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
3389     const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
3390     const_float64( 0x3de6124613a86d09ll ), /* 13 */
3391     const_float64( 0x3da93974a8c07c9dll ), /* 14 */
3392     const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
3393 };
3394 
3395 float32 float32_exp2(float32 a, float_status *status)
3396 {
3397     flag aSign;
3398     int aExp;
3399     uint32_t aSig;
3400     float64 r, x, xn;
3401     int i;
3402     a = float32_squash_input_denormal(a, status);
3403 
3404     aSig = extractFloat32Frac( a );
3405     aExp = extractFloat32Exp( a );
3406     aSign = extractFloat32Sign( a );
3407 
3408     if ( aExp == 0xFF) {
3409         if (aSig) {
3410             return propagateFloat32NaN(a, float32_zero, status);
3411         }
3412         return (aSign) ? float32_zero : a;
3413     }
3414     if (aExp == 0) {
3415         if (aSig == 0) return float32_one;
3416     }
3417 
3418     float_raise(float_flag_inexact, status);
3419 
3420     /* ******************************* */
3421     /* using float64 for approximation */
3422     /* ******************************* */
3423     x = float32_to_float64(a, status);
3424     x = float64_mul(x, float64_ln2, status);
3425 
3426     xn = x;
3427     r = float64_one;
3428     for (i = 0 ; i < 15 ; i++) {
3429         float64 f;
3430 
3431         f = float64_mul(xn, float32_exp2_coefficients[i], status);
3432         r = float64_add(r, f, status);
3433 
3434         xn = float64_mul(xn, x, status);
3435     }
3436 
3437     return float64_to_float32(r, status);
3438 }
3439 
3440 /*----------------------------------------------------------------------------
3441 | Returns the binary log of the single-precision floating-point value `a'.
3442 | The operation is performed according to the IEC/IEEE Standard for Binary
3443 | Floating-Point Arithmetic.
3444 *----------------------------------------------------------------------------*/
3445 float32 float32_log2(float32 a, float_status *status)
3446 {
3447     flag aSign, zSign;
3448     int aExp;
3449     uint32_t aSig, zSig, i;
3450 
3451     a = float32_squash_input_denormal(a, status);
3452     aSig = extractFloat32Frac( a );
3453     aExp = extractFloat32Exp( a );
3454     aSign = extractFloat32Sign( a );
3455 
3456     if ( aExp == 0 ) {
3457         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
3458         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3459     }
3460     if ( aSign ) {
3461         float_raise(float_flag_invalid, status);
3462         return float32_default_nan(status);
3463     }
3464     if ( aExp == 0xFF ) {
3465         if (aSig) {
3466             return propagateFloat32NaN(a, float32_zero, status);
3467         }
3468         return a;
3469     }
3470 
3471     aExp -= 0x7F;
3472     aSig |= 0x00800000;
3473     zSign = aExp < 0;
3474     zSig = aExp << 23;
3475 
3476     for (i = 1 << 22; i > 0; i >>= 1) {
3477         aSig = ( (uint64_t)aSig * aSig ) >> 23;
3478         if ( aSig & 0x01000000 ) {
3479             aSig >>= 1;
3480             zSig |= i;
3481         }
3482     }
3483 
3484     if ( zSign )
3485         zSig = -zSig;
3486 
3487     return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
3488 }
3489 
3490 /*----------------------------------------------------------------------------
3491 | Returns 1 if the single-precision floating-point value `a' is equal to
3492 | the corresponding value `b', and 0 otherwise.  The invalid exception is
3493 | raised if either operand is a NaN.  Otherwise, the comparison is performed
3494 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3495 *----------------------------------------------------------------------------*/
3496 
3497 int float32_eq(float32 a, float32 b, float_status *status)
3498 {
3499     uint32_t av, bv;
3500     a = float32_squash_input_denormal(a, status);
3501     b = float32_squash_input_denormal(b, status);
3502 
3503     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3504          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3505        ) {
3506         float_raise(float_flag_invalid, status);
3507         return 0;
3508     }
3509     av = float32_val(a);
3510     bv = float32_val(b);
3511     return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3512 }
3513 
3514 /*----------------------------------------------------------------------------
3515 | Returns 1 if the single-precision floating-point value `a' is less than
3516 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
3517 | exception is raised if either operand is a NaN.  The comparison is performed
3518 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3519 *----------------------------------------------------------------------------*/
3520 
3521 int float32_le(float32 a, float32 b, float_status *status)
3522 {
3523     flag aSign, bSign;
3524     uint32_t av, bv;
3525     a = float32_squash_input_denormal(a, status);
3526     b = float32_squash_input_denormal(b, status);
3527 
3528     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3529          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3530        ) {
3531         float_raise(float_flag_invalid, status);
3532         return 0;
3533     }
3534     aSign = extractFloat32Sign( a );
3535     bSign = extractFloat32Sign( b );
3536     av = float32_val(a);
3537     bv = float32_val(b);
3538     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3539     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3540 
3541 }
3542 
3543 /*----------------------------------------------------------------------------
3544 | Returns 1 if the single-precision floating-point value `a' is less than
3545 | the corresponding value `b', and 0 otherwise.  The invalid exception is
3546 | raised if either operand is a NaN.  The comparison is performed according
3547 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3548 *----------------------------------------------------------------------------*/
3549 
3550 int float32_lt(float32 a, float32 b, float_status *status)
3551 {
3552     flag aSign, bSign;
3553     uint32_t av, bv;
3554     a = float32_squash_input_denormal(a, status);
3555     b = float32_squash_input_denormal(b, status);
3556 
3557     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3558          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3559        ) {
3560         float_raise(float_flag_invalid, status);
3561         return 0;
3562     }
3563     aSign = extractFloat32Sign( a );
3564     bSign = extractFloat32Sign( b );
3565     av = float32_val(a);
3566     bv = float32_val(b);
3567     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3568     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3569 
3570 }
3571 
3572 /*----------------------------------------------------------------------------
3573 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3574 | be compared, and 0 otherwise.  The invalid exception is raised if either
3575 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
3576 | Standard for Binary Floating-Point Arithmetic.
3577 *----------------------------------------------------------------------------*/
3578 
3579 int float32_unordered(float32 a, float32 b, float_status *status)
3580 {
3581     a = float32_squash_input_denormal(a, status);
3582     b = float32_squash_input_denormal(b, status);
3583 
3584     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3585          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3586        ) {
3587         float_raise(float_flag_invalid, status);
3588         return 1;
3589     }
3590     return 0;
3591 }
3592 
3593 /*----------------------------------------------------------------------------
3594 | Returns 1 if the single-precision floating-point value `a' is equal to
3595 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3596 | exception.  The comparison is performed according to the IEC/IEEE Standard
3597 | for Binary Floating-Point Arithmetic.
3598 *----------------------------------------------------------------------------*/
3599 
3600 int float32_eq_quiet(float32 a, float32 b, float_status *status)
3601 {
3602     a = float32_squash_input_denormal(a, status);
3603     b = float32_squash_input_denormal(b, status);
3604 
3605     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3606          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3607        ) {
3608         if (float32_is_signaling_nan(a, status)
3609          || float32_is_signaling_nan(b, status)) {
3610             float_raise(float_flag_invalid, status);
3611         }
3612         return 0;
3613     }
3614     return ( float32_val(a) == float32_val(b) ) ||
3615             ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
3616 }
3617 
3618 /*----------------------------------------------------------------------------
3619 | Returns 1 if the single-precision floating-point value `a' is less than or
3620 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3621 | cause an exception.  Otherwise, the comparison is performed according to the
3622 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3623 *----------------------------------------------------------------------------*/
3624 
3625 int float32_le_quiet(float32 a, float32 b, float_status *status)
3626 {
3627     flag aSign, bSign;
3628     uint32_t av, bv;
3629     a = float32_squash_input_denormal(a, status);
3630     b = float32_squash_input_denormal(b, status);
3631 
3632     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3633          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3634        ) {
3635         if (float32_is_signaling_nan(a, status)
3636          || float32_is_signaling_nan(b, status)) {
3637             float_raise(float_flag_invalid, status);
3638         }
3639         return 0;
3640     }
3641     aSign = extractFloat32Sign( a );
3642     bSign = extractFloat32Sign( b );
3643     av = float32_val(a);
3644     bv = float32_val(b);
3645     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3646     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3647 
3648 }
3649 
3650 /*----------------------------------------------------------------------------
3651 | Returns 1 if the single-precision floating-point value `a' is less than
3652 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3653 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3654 | Standard for Binary Floating-Point Arithmetic.
3655 *----------------------------------------------------------------------------*/
3656 
3657 int float32_lt_quiet(float32 a, float32 b, float_status *status)
3658 {
3659     flag aSign, bSign;
3660     uint32_t av, bv;
3661     a = float32_squash_input_denormal(a, status);
3662     b = float32_squash_input_denormal(b, status);
3663 
3664     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3665          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3666        ) {
3667         if (float32_is_signaling_nan(a, status)
3668          || float32_is_signaling_nan(b, status)) {
3669             float_raise(float_flag_invalid, status);
3670         }
3671         return 0;
3672     }
3673     aSign = extractFloat32Sign( a );
3674     bSign = extractFloat32Sign( b );
3675     av = float32_val(a);
3676     bv = float32_val(b);
3677     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3678     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3679 
3680 }
3681 
3682 /*----------------------------------------------------------------------------
3683 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3684 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
3685 | comparison is performed according to the IEC/IEEE Standard for Binary
3686 | Floating-Point Arithmetic.
3687 *----------------------------------------------------------------------------*/
3688 
3689 int float32_unordered_quiet(float32 a, float32 b, float_status *status)
3690 {
3691     a = float32_squash_input_denormal(a, status);
3692     b = float32_squash_input_denormal(b, status);
3693 
3694     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3695          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3696        ) {
3697         if (float32_is_signaling_nan(a, status)
3698          || float32_is_signaling_nan(b, status)) {
3699             float_raise(float_flag_invalid, status);
3700         }
3701         return 1;
3702     }
3703     return 0;
3704 }
3705 
3706 
3707 /*----------------------------------------------------------------------------
3708 | Returns the result of converting the double-precision floating-point value
3709 | `a' to the single-precision floating-point format.  The conversion is
3710 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3711 | Arithmetic.
3712 *----------------------------------------------------------------------------*/
3713 
3714 float32 float64_to_float32(float64 a, float_status *status)
3715 {
3716     flag aSign;
3717     int aExp;
3718     uint64_t aSig;
3719     uint32_t zSig;
3720     a = float64_squash_input_denormal(a, status);
3721 
3722     aSig = extractFloat64Frac( a );
3723     aExp = extractFloat64Exp( a );
3724     aSign = extractFloat64Sign( a );
3725     if ( aExp == 0x7FF ) {
3726         if (aSig) {
3727             return commonNaNToFloat32(float64ToCommonNaN(a, status), status);
3728         }
3729         return packFloat32( aSign, 0xFF, 0 );
3730     }
3731     shift64RightJamming( aSig, 22, &aSig );
3732     zSig = aSig;
3733     if ( aExp || zSig ) {
3734         zSig |= 0x40000000;
3735         aExp -= 0x381;
3736     }
3737     return roundAndPackFloat32(aSign, aExp, zSig, status);
3738 
3739 }
3740 
3741 
3742 /*----------------------------------------------------------------------------
3743 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3744 | half-precision floating-point value, returning the result.  After being
3745 | shifted into the proper positions, the three fields are simply added
3746 | together to form the result.  This means that any integer portion of `zSig'
3747 | will be added into the exponent.  Since a properly normalized significand
3748 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3749 | than the desired result exponent whenever `zSig' is a complete, normalized
3750 | significand.
3751 *----------------------------------------------------------------------------*/
3752 static float16 packFloat16(flag zSign, int zExp, uint16_t zSig)
3753 {
3754     return make_float16(
3755         (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
3756 }
3757 
3758 /*----------------------------------------------------------------------------
3759 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3760 | and significand `zSig', and returns the proper half-precision floating-
3761 | point value corresponding to the abstract input.  Ordinarily, the abstract
3762 | value is simply rounded and packed into the half-precision format, with
3763 | the inexact exception raised if the abstract input cannot be represented
3764 | exactly.  However, if the abstract value is too large, the overflow and
3765 | inexact exceptions are raised and an infinity or maximal finite value is
3766 | returned.  If the abstract value is too small, the input value is rounded to
3767 | a subnormal number, and the underflow and inexact exceptions are raised if
3768 | the abstract input cannot be represented exactly as a subnormal half-
3769 | precision floating-point number.
3770 | The `ieee' flag indicates whether to use IEEE standard half precision, or
3771 | ARM-style "alternative representation", which omits the NaN and Inf
3772 | encodings in order to raise the maximum representable exponent by one.
3773 |     The input significand `zSig' has its binary point between bits 22
3774 | and 23, which is 13 bits to the left of the usual location.  This shifted
3775 | significand must be normalized or smaller.  If `zSig' is not normalized,
3776 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3777 | and it must not require rounding.  In the usual case that `zSig' is
3778 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3779 | Note the slightly odd position of the binary point in zSig compared with the
3780 | other roundAndPackFloat functions. This should probably be fixed if we
3781 | need to implement more float16 routines than just conversion.
3782 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3783 | Binary Floating-Point Arithmetic.
3784 *----------------------------------------------------------------------------*/
3785 
3786 static float16 roundAndPackFloat16(flag zSign, int zExp,
3787                                    uint32_t zSig, flag ieee,
3788                                    float_status *status)
3789 {
3790     int maxexp = ieee ? 29 : 30;
3791     uint32_t mask;
3792     uint32_t increment;
3793     bool rounding_bumps_exp;
3794     bool is_tiny = false;
3795 
3796     /* Calculate the mask of bits of the mantissa which are not
3797      * representable in half-precision and will be lost.
3798      */
3799     if (zExp < 1) {
3800         /* Will be denormal in halfprec */
3801         mask = 0x00ffffff;
3802         if (zExp >= -11) {
3803             mask >>= 11 + zExp;
3804         }
3805     } else {
3806         /* Normal number in halfprec */
3807         mask = 0x00001fff;
3808     }
3809 
3810     switch (status->float_rounding_mode) {
3811     case float_round_nearest_even:
3812         increment = (mask + 1) >> 1;
3813         if ((zSig & mask) == increment) {
3814             increment = zSig & (increment << 1);
3815         }
3816         break;
3817     case float_round_ties_away:
3818         increment = (mask + 1) >> 1;
3819         break;
3820     case float_round_up:
3821         increment = zSign ? 0 : mask;
3822         break;
3823     case float_round_down:
3824         increment = zSign ? mask : 0;
3825         break;
3826     default: /* round_to_zero */
3827         increment = 0;
3828         break;
3829     }
3830 
3831     rounding_bumps_exp = (zSig + increment >= 0x01000000);
3832 
3833     if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) {
3834         if (ieee) {
3835             float_raise(float_flag_overflow | float_flag_inexact, status);
3836             return packFloat16(zSign, 0x1f, 0);
3837         } else {
3838             float_raise(float_flag_invalid, status);
3839             return packFloat16(zSign, 0x1f, 0x3ff);
3840         }
3841     }
3842 
3843     if (zExp < 0) {
3844         /* Note that flush-to-zero does not affect half-precision results */
3845         is_tiny =
3846             (status->float_detect_tininess == float_tininess_before_rounding)
3847             || (zExp < -1)
3848             || (!rounding_bumps_exp);
3849     }
3850     if (zSig & mask) {
3851         float_raise(float_flag_inexact, status);
3852         if (is_tiny) {
3853             float_raise(float_flag_underflow, status);
3854         }
3855     }
3856 
3857     zSig += increment;
3858     if (rounding_bumps_exp) {
3859         zSig >>= 1;
3860         zExp++;
3861     }
3862 
3863     if (zExp < -10) {
3864         return packFloat16(zSign, 0, 0);
3865     }
3866     if (zExp < 0) {
3867         zSig >>= -zExp;
3868         zExp = 0;
3869     }
3870     return packFloat16(zSign, zExp, zSig >> 13);
3871 }
3872 
3873 /*----------------------------------------------------------------------------
3874 | If `a' is denormal and we are in flush-to-zero mode then set the
3875 | input-denormal exception and return zero. Otherwise just return the value.
3876 *----------------------------------------------------------------------------*/
3877 float16 float16_squash_input_denormal(float16 a, float_status *status)
3878 {
3879     if (status->flush_inputs_to_zero) {
3880         if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) {
3881             float_raise(float_flag_input_denormal, status);
3882             return make_float16(float16_val(a) & 0x8000);
3883         }
3884     }
3885     return a;
3886 }
3887 
3888 static void normalizeFloat16Subnormal(uint32_t aSig, int *zExpPtr,
3889                                       uint32_t *zSigPtr)
3890 {
3891     int8_t shiftCount = countLeadingZeros32(aSig) - 21;
3892     *zSigPtr = aSig << shiftCount;
3893     *zExpPtr = 1 - shiftCount;
3894 }
3895 
3896 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
3897    The latter gains extra exponent range by omitting the NaN/Inf encodings.  */
3898 
3899 float32 float16_to_float32(float16 a, flag ieee, float_status *status)
3900 {
3901     flag aSign;
3902     int aExp;
3903     uint32_t aSig;
3904 
3905     aSign = extractFloat16Sign(a);
3906     aExp = extractFloat16Exp(a);
3907     aSig = extractFloat16Frac(a);
3908 
3909     if (aExp == 0x1f && ieee) {
3910         if (aSig) {
3911             return commonNaNToFloat32(float16ToCommonNaN(a, status), status);
3912         }
3913         return packFloat32(aSign, 0xff, 0);
3914     }
3915     if (aExp == 0) {
3916         if (aSig == 0) {
3917             return packFloat32(aSign, 0, 0);
3918         }
3919 
3920         normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3921         aExp--;
3922     }
3923     return packFloat32( aSign, aExp + 0x70, aSig << 13);
3924 }
3925 
3926 float16 float32_to_float16(float32 a, flag ieee, float_status *status)
3927 {
3928     flag aSign;
3929     int aExp;
3930     uint32_t aSig;
3931 
3932     a = float32_squash_input_denormal(a, status);
3933 
3934     aSig = extractFloat32Frac( a );
3935     aExp = extractFloat32Exp( a );
3936     aSign = extractFloat32Sign( a );
3937     if ( aExp == 0xFF ) {
3938         if (aSig) {
3939             /* Input is a NaN */
3940             if (!ieee) {
3941                 float_raise(float_flag_invalid, status);
3942                 return packFloat16(aSign, 0, 0);
3943             }
3944             return commonNaNToFloat16(
3945                 float32ToCommonNaN(a, status), status);
3946         }
3947         /* Infinity */
3948         if (!ieee) {
3949             float_raise(float_flag_invalid, status);
3950             return packFloat16(aSign, 0x1f, 0x3ff);
3951         }
3952         return packFloat16(aSign, 0x1f, 0);
3953     }
3954     if (aExp == 0 && aSig == 0) {
3955         return packFloat16(aSign, 0, 0);
3956     }
3957     /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3958      * even if the input is denormal; however this is harmless because
3959      * the largest possible single-precision denormal is still smaller
3960      * than the smallest representable half-precision denormal, and so we
3961      * will end up ignoring aSig and returning via the "always return zero"
3962      * codepath.
3963      */
3964     aSig |= 0x00800000;
3965     aExp -= 0x71;
3966 
3967     return roundAndPackFloat16(aSign, aExp, aSig, ieee, status);
3968 }
3969 
3970 float64 float16_to_float64(float16 a, flag ieee, float_status *status)
3971 {
3972     flag aSign;
3973     int aExp;
3974     uint32_t aSig;
3975 
3976     aSign = extractFloat16Sign(a);
3977     aExp = extractFloat16Exp(a);
3978     aSig = extractFloat16Frac(a);
3979 
3980     if (aExp == 0x1f && ieee) {
3981         if (aSig) {
3982             return commonNaNToFloat64(
3983                 float16ToCommonNaN(a, status), status);
3984         }
3985         return packFloat64(aSign, 0x7ff, 0);
3986     }
3987     if (aExp == 0) {
3988         if (aSig == 0) {
3989             return packFloat64(aSign, 0, 0);
3990         }
3991 
3992         normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3993         aExp--;
3994     }
3995     return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42);
3996 }
3997 
3998 float16 float64_to_float16(float64 a, flag ieee, float_status *status)
3999 {
4000     flag aSign;
4001     int aExp;
4002     uint64_t aSig;
4003     uint32_t zSig;
4004 
4005     a = float64_squash_input_denormal(a, status);
4006 
4007     aSig = extractFloat64Frac(a);
4008     aExp = extractFloat64Exp(a);
4009     aSign = extractFloat64Sign(a);
4010     if (aExp == 0x7FF) {
4011         if (aSig) {
4012             /* Input is a NaN */
4013             if (!ieee) {
4014                 float_raise(float_flag_invalid, status);
4015                 return packFloat16(aSign, 0, 0);
4016             }
4017             return commonNaNToFloat16(
4018                 float64ToCommonNaN(a, status), status);
4019         }
4020         /* Infinity */
4021         if (!ieee) {
4022             float_raise(float_flag_invalid, status);
4023             return packFloat16(aSign, 0x1f, 0x3ff);
4024         }
4025         return packFloat16(aSign, 0x1f, 0);
4026     }
4027     shift64RightJamming(aSig, 29, &aSig);
4028     zSig = aSig;
4029     if (aExp == 0 && zSig == 0) {
4030         return packFloat16(aSign, 0, 0);
4031     }
4032     /* Decimal point between bits 22 and 23. Note that we add the 1 bit
4033      * even if the input is denormal; however this is harmless because
4034      * the largest possible single-precision denormal is still smaller
4035      * than the smallest representable half-precision denormal, and so we
4036      * will end up ignoring aSig and returning via the "always return zero"
4037      * codepath.
4038      */
4039     zSig |= 0x00800000;
4040     aExp -= 0x3F1;
4041 
4042     return roundAndPackFloat16(aSign, aExp, zSig, ieee, status);
4043 }
4044 
4045 /*----------------------------------------------------------------------------
4046 | Returns the result of converting the double-precision floating-point value
4047 | `a' to the extended double-precision floating-point format.  The conversion
4048 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4049 | Arithmetic.
4050 *----------------------------------------------------------------------------*/
4051 
4052 floatx80 float64_to_floatx80(float64 a, float_status *status)
4053 {
4054     flag aSign;
4055     int aExp;
4056     uint64_t aSig;
4057 
4058     a = float64_squash_input_denormal(a, status);
4059     aSig = extractFloat64Frac( a );
4060     aExp = extractFloat64Exp( a );
4061     aSign = extractFloat64Sign( a );
4062     if ( aExp == 0x7FF ) {
4063         if (aSig) {
4064             return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
4065         }
4066         return packFloatx80(aSign,
4067                             floatx80_infinity_high,
4068                             floatx80_infinity_low);
4069     }
4070     if ( aExp == 0 ) {
4071         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
4072         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4073     }
4074     return
4075         packFloatx80(
4076             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
4077 
4078 }
4079 
4080 /*----------------------------------------------------------------------------
4081 | Returns the result of converting the double-precision floating-point value
4082 | `a' to the quadruple-precision floating-point format.  The conversion is
4083 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4084 | Arithmetic.
4085 *----------------------------------------------------------------------------*/
4086 
4087 float128 float64_to_float128(float64 a, float_status *status)
4088 {
4089     flag aSign;
4090     int aExp;
4091     uint64_t aSig, zSig0, zSig1;
4092 
4093     a = float64_squash_input_denormal(a, status);
4094     aSig = extractFloat64Frac( a );
4095     aExp = extractFloat64Exp( a );
4096     aSign = extractFloat64Sign( a );
4097     if ( aExp == 0x7FF ) {
4098         if (aSig) {
4099             return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
4100         }
4101         return packFloat128( aSign, 0x7FFF, 0, 0 );
4102     }
4103     if ( aExp == 0 ) {
4104         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
4105         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4106         --aExp;
4107     }
4108     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
4109     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
4110 
4111 }
4112 
4113 
4114 /*----------------------------------------------------------------------------
4115 | Returns the remainder of the double-precision floating-point value `a'
4116 | with respect to the corresponding value `b'.  The operation is performed
4117 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4118 *----------------------------------------------------------------------------*/
4119 
4120 float64 float64_rem(float64 a, float64 b, float_status *status)
4121 {
4122     flag aSign, zSign;
4123     int aExp, bExp, expDiff;
4124     uint64_t aSig, bSig;
4125     uint64_t q, alternateASig;
4126     int64_t sigMean;
4127 
4128     a = float64_squash_input_denormal(a, status);
4129     b = float64_squash_input_denormal(b, status);
4130     aSig = extractFloat64Frac( a );
4131     aExp = extractFloat64Exp( a );
4132     aSign = extractFloat64Sign( a );
4133     bSig = extractFloat64Frac( b );
4134     bExp = extractFloat64Exp( b );
4135     if ( aExp == 0x7FF ) {
4136         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
4137             return propagateFloat64NaN(a, b, status);
4138         }
4139         float_raise(float_flag_invalid, status);
4140         return float64_default_nan(status);
4141     }
4142     if ( bExp == 0x7FF ) {
4143         if (bSig) {
4144             return propagateFloat64NaN(a, b, status);
4145         }
4146         return a;
4147     }
4148     if ( bExp == 0 ) {
4149         if ( bSig == 0 ) {
4150             float_raise(float_flag_invalid, status);
4151             return float64_default_nan(status);
4152         }
4153         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4154     }
4155     if ( aExp == 0 ) {
4156         if ( aSig == 0 ) return a;
4157         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4158     }
4159     expDiff = aExp - bExp;
4160     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
4161     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4162     if ( expDiff < 0 ) {
4163         if ( expDiff < -1 ) return a;
4164         aSig >>= 1;
4165     }
4166     q = ( bSig <= aSig );
4167     if ( q ) aSig -= bSig;
4168     expDiff -= 64;
4169     while ( 0 < expDiff ) {
4170         q = estimateDiv128To64( aSig, 0, bSig );
4171         q = ( 2 < q ) ? q - 2 : 0;
4172         aSig = - ( ( bSig>>2 ) * q );
4173         expDiff -= 62;
4174     }
4175     expDiff += 64;
4176     if ( 0 < expDiff ) {
4177         q = estimateDiv128To64( aSig, 0, bSig );
4178         q = ( 2 < q ) ? q - 2 : 0;
4179         q >>= 64 - expDiff;
4180         bSig >>= 2;
4181         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4182     }
4183     else {
4184         aSig >>= 2;
4185         bSig >>= 2;
4186     }
4187     do {
4188         alternateASig = aSig;
4189         ++q;
4190         aSig -= bSig;
4191     } while ( 0 <= (int64_t) aSig );
4192     sigMean = aSig + alternateASig;
4193     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4194         aSig = alternateASig;
4195     }
4196     zSign = ( (int64_t) aSig < 0 );
4197     if ( zSign ) aSig = - aSig;
4198     return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
4199 
4200 }
4201 
4202 /*----------------------------------------------------------------------------
4203 | Returns the binary log of the double-precision floating-point value `a'.
4204 | The operation is performed according to the IEC/IEEE Standard for Binary
4205 | Floating-Point Arithmetic.
4206 *----------------------------------------------------------------------------*/
4207 float64 float64_log2(float64 a, float_status *status)
4208 {
4209     flag aSign, zSign;
4210     int aExp;
4211     uint64_t aSig, aSig0, aSig1, zSig, i;
4212     a = float64_squash_input_denormal(a, status);
4213 
4214     aSig = extractFloat64Frac( a );
4215     aExp = extractFloat64Exp( a );
4216     aSign = extractFloat64Sign( a );
4217 
4218     if ( aExp == 0 ) {
4219         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4220         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4221     }
4222     if ( aSign ) {
4223         float_raise(float_flag_invalid, status);
4224         return float64_default_nan(status);
4225     }
4226     if ( aExp == 0x7FF ) {
4227         if (aSig) {
4228             return propagateFloat64NaN(a, float64_zero, status);
4229         }
4230         return a;
4231     }
4232 
4233     aExp -= 0x3FF;
4234     aSig |= LIT64( 0x0010000000000000 );
4235     zSign = aExp < 0;
4236     zSig = (uint64_t)aExp << 52;
4237     for (i = 1LL << 51; i > 0; i >>= 1) {
4238         mul64To128( aSig, aSig, &aSig0, &aSig1 );
4239         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4240         if ( aSig & LIT64( 0x0020000000000000 ) ) {
4241             aSig >>= 1;
4242             zSig |= i;
4243         }
4244     }
4245 
4246     if ( zSign )
4247         zSig = -zSig;
4248     return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
4249 }
4250 
4251 /*----------------------------------------------------------------------------
4252 | Returns 1 if the double-precision floating-point value `a' is equal to the
4253 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
4254 | if either operand is a NaN.  Otherwise, the comparison is performed
4255 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4256 *----------------------------------------------------------------------------*/
4257 
4258 int float64_eq(float64 a, float64 b, float_status *status)
4259 {
4260     uint64_t av, bv;
4261     a = float64_squash_input_denormal(a, status);
4262     b = float64_squash_input_denormal(b, status);
4263 
4264     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4265          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4266        ) {
4267         float_raise(float_flag_invalid, status);
4268         return 0;
4269     }
4270     av = float64_val(a);
4271     bv = float64_val(b);
4272     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4273 
4274 }
4275 
4276 /*----------------------------------------------------------------------------
4277 | Returns 1 if the double-precision floating-point value `a' is less than or
4278 | equal to the corresponding value `b', and 0 otherwise.  The invalid
4279 | exception is raised if either operand is a NaN.  The comparison is performed
4280 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4281 *----------------------------------------------------------------------------*/
4282 
4283 int float64_le(float64 a, float64 b, float_status *status)
4284 {
4285     flag aSign, bSign;
4286     uint64_t av, bv;
4287     a = float64_squash_input_denormal(a, status);
4288     b = float64_squash_input_denormal(b, status);
4289 
4290     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4291          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4292        ) {
4293         float_raise(float_flag_invalid, status);
4294         return 0;
4295     }
4296     aSign = extractFloat64Sign( a );
4297     bSign = extractFloat64Sign( b );
4298     av = float64_val(a);
4299     bv = float64_val(b);
4300     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4301     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4302 
4303 }
4304 
4305 /*----------------------------------------------------------------------------
4306 | Returns 1 if the double-precision floating-point value `a' is less than
4307 | the corresponding value `b', and 0 otherwise.  The invalid exception is
4308 | raised if either operand is a NaN.  The comparison is performed according
4309 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4310 *----------------------------------------------------------------------------*/
4311 
4312 int float64_lt(float64 a, float64 b, float_status *status)
4313 {
4314     flag aSign, bSign;
4315     uint64_t av, bv;
4316 
4317     a = float64_squash_input_denormal(a, status);
4318     b = float64_squash_input_denormal(b, status);
4319     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4320          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4321        ) {
4322         float_raise(float_flag_invalid, status);
4323         return 0;
4324     }
4325     aSign = extractFloat64Sign( a );
4326     bSign = extractFloat64Sign( b );
4327     av = float64_val(a);
4328     bv = float64_val(b);
4329     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4330     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4331 
4332 }
4333 
4334 /*----------------------------------------------------------------------------
4335 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4336 | be compared, and 0 otherwise.  The invalid exception is raised if either
4337 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
4338 | Standard for Binary Floating-Point Arithmetic.
4339 *----------------------------------------------------------------------------*/
4340 
4341 int float64_unordered(float64 a, float64 b, float_status *status)
4342 {
4343     a = float64_squash_input_denormal(a, status);
4344     b = float64_squash_input_denormal(b, status);
4345 
4346     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4347          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4348        ) {
4349         float_raise(float_flag_invalid, status);
4350         return 1;
4351     }
4352     return 0;
4353 }
4354 
4355 /*----------------------------------------------------------------------------
4356 | Returns 1 if the double-precision floating-point value `a' is equal to the
4357 | corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4358 | exception.The comparison is performed according to the IEC/IEEE Standard
4359 | for Binary Floating-Point Arithmetic.
4360 *----------------------------------------------------------------------------*/
4361 
4362 int float64_eq_quiet(float64 a, float64 b, float_status *status)
4363 {
4364     uint64_t av, bv;
4365     a = float64_squash_input_denormal(a, status);
4366     b = float64_squash_input_denormal(b, status);
4367 
4368     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4369          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4370        ) {
4371         if (float64_is_signaling_nan(a, status)
4372          || float64_is_signaling_nan(b, status)) {
4373             float_raise(float_flag_invalid, status);
4374         }
4375         return 0;
4376     }
4377     av = float64_val(a);
4378     bv = float64_val(b);
4379     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4380 
4381 }
4382 
4383 /*----------------------------------------------------------------------------
4384 | Returns 1 if the double-precision floating-point value `a' is less than or
4385 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4386 | cause an exception.  Otherwise, the comparison is performed according to the
4387 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4388 *----------------------------------------------------------------------------*/
4389 
4390 int float64_le_quiet(float64 a, float64 b, float_status *status)
4391 {
4392     flag aSign, bSign;
4393     uint64_t av, bv;
4394     a = float64_squash_input_denormal(a, status);
4395     b = float64_squash_input_denormal(b, status);
4396 
4397     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4398          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4399        ) {
4400         if (float64_is_signaling_nan(a, status)
4401          || float64_is_signaling_nan(b, status)) {
4402             float_raise(float_flag_invalid, status);
4403         }
4404         return 0;
4405     }
4406     aSign = extractFloat64Sign( a );
4407     bSign = extractFloat64Sign( b );
4408     av = float64_val(a);
4409     bv = float64_val(b);
4410     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4411     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4412 
4413 }
4414 
4415 /*----------------------------------------------------------------------------
4416 | Returns 1 if the double-precision floating-point value `a' is less than
4417 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4418 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4419 | Standard for Binary Floating-Point Arithmetic.
4420 *----------------------------------------------------------------------------*/
4421 
4422 int float64_lt_quiet(float64 a, float64 b, float_status *status)
4423 {
4424     flag aSign, bSign;
4425     uint64_t av, bv;
4426     a = float64_squash_input_denormal(a, status);
4427     b = float64_squash_input_denormal(b, status);
4428 
4429     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4430          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4431        ) {
4432         if (float64_is_signaling_nan(a, status)
4433          || float64_is_signaling_nan(b, status)) {
4434             float_raise(float_flag_invalid, status);
4435         }
4436         return 0;
4437     }
4438     aSign = extractFloat64Sign( a );
4439     bSign = extractFloat64Sign( b );
4440     av = float64_val(a);
4441     bv = float64_val(b);
4442     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4443     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4444 
4445 }
4446 
4447 /*----------------------------------------------------------------------------
4448 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4449 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
4450 | comparison is performed according to the IEC/IEEE Standard for Binary
4451 | Floating-Point Arithmetic.
4452 *----------------------------------------------------------------------------*/
4453 
4454 int float64_unordered_quiet(float64 a, float64 b, float_status *status)
4455 {
4456     a = float64_squash_input_denormal(a, status);
4457     b = float64_squash_input_denormal(b, status);
4458 
4459     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4460          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4461        ) {
4462         if (float64_is_signaling_nan(a, status)
4463          || float64_is_signaling_nan(b, status)) {
4464             float_raise(float_flag_invalid, status);
4465         }
4466         return 1;
4467     }
4468     return 0;
4469 }
4470 
4471 /*----------------------------------------------------------------------------
4472 | Returns the result of converting the extended double-precision floating-
4473 | point value `a' to the 32-bit two's complement integer format.  The
4474 | conversion is performed according to the IEC/IEEE Standard for Binary
4475 | Floating-Point Arithmetic---which means in particular that the conversion
4476 | is rounded according to the current rounding mode.  If `a' is a NaN, the
4477 | largest positive integer is returned.  Otherwise, if the conversion
4478 | overflows, the largest integer with the same sign as `a' is returned.
4479 *----------------------------------------------------------------------------*/
4480 
4481 int32_t floatx80_to_int32(floatx80 a, float_status *status)
4482 {
4483     flag aSign;
4484     int32_t aExp, shiftCount;
4485     uint64_t aSig;
4486 
4487     if (floatx80_invalid_encoding(a)) {
4488         float_raise(float_flag_invalid, status);
4489         return 1 << 31;
4490     }
4491     aSig = extractFloatx80Frac( a );
4492     aExp = extractFloatx80Exp( a );
4493     aSign = extractFloatx80Sign( a );
4494     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4495     shiftCount = 0x4037 - aExp;
4496     if ( shiftCount <= 0 ) shiftCount = 1;
4497     shift64RightJamming( aSig, shiftCount, &aSig );
4498     return roundAndPackInt32(aSign, aSig, status);
4499 
4500 }
4501 
4502 /*----------------------------------------------------------------------------
4503 | Returns the result of converting the extended double-precision floating-
4504 | point value `a' to the 32-bit two's complement integer format.  The
4505 | conversion is performed according to the IEC/IEEE Standard for Binary
4506 | Floating-Point Arithmetic, except that the conversion is always rounded
4507 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4508 | Otherwise, if the conversion overflows, the largest integer with the same
4509 | sign as `a' is returned.
4510 *----------------------------------------------------------------------------*/
4511 
4512 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
4513 {
4514     flag aSign;
4515     int32_t aExp, shiftCount;
4516     uint64_t aSig, savedASig;
4517     int32_t z;
4518 
4519     if (floatx80_invalid_encoding(a)) {
4520         float_raise(float_flag_invalid, status);
4521         return 1 << 31;
4522     }
4523     aSig = extractFloatx80Frac( a );
4524     aExp = extractFloatx80Exp( a );
4525     aSign = extractFloatx80Sign( a );
4526     if ( 0x401E < aExp ) {
4527         if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4528         goto invalid;
4529     }
4530     else if ( aExp < 0x3FFF ) {
4531         if (aExp || aSig) {
4532             status->float_exception_flags |= float_flag_inexact;
4533         }
4534         return 0;
4535     }
4536     shiftCount = 0x403E - aExp;
4537     savedASig = aSig;
4538     aSig >>= shiftCount;
4539     z = aSig;
4540     if ( aSign ) z = - z;
4541     if ( ( z < 0 ) ^ aSign ) {
4542  invalid:
4543         float_raise(float_flag_invalid, status);
4544         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4545     }
4546     if ( ( aSig<<shiftCount ) != savedASig ) {
4547         status->float_exception_flags |= float_flag_inexact;
4548     }
4549     return z;
4550 
4551 }
4552 
4553 /*----------------------------------------------------------------------------
4554 | Returns the result of converting the extended double-precision floating-
4555 | point value `a' to the 64-bit two's complement integer format.  The
4556 | conversion is performed according to the IEC/IEEE Standard for Binary
4557 | Floating-Point Arithmetic---which means in particular that the conversion
4558 | is rounded according to the current rounding mode.  If `a' is a NaN,
4559 | the largest positive integer is returned.  Otherwise, if the conversion
4560 | overflows, the largest integer with the same sign as `a' is returned.
4561 *----------------------------------------------------------------------------*/
4562 
4563 int64_t floatx80_to_int64(floatx80 a, float_status *status)
4564 {
4565     flag aSign;
4566     int32_t aExp, shiftCount;
4567     uint64_t aSig, aSigExtra;
4568 
4569     if (floatx80_invalid_encoding(a)) {
4570         float_raise(float_flag_invalid, status);
4571         return 1ULL << 63;
4572     }
4573     aSig = extractFloatx80Frac( a );
4574     aExp = extractFloatx80Exp( a );
4575     aSign = extractFloatx80Sign( a );
4576     shiftCount = 0x403E - aExp;
4577     if ( shiftCount <= 0 ) {
4578         if ( shiftCount ) {
4579             float_raise(float_flag_invalid, status);
4580             if (!aSign || floatx80_is_any_nan(a)) {
4581                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4582             }
4583             return (int64_t) LIT64( 0x8000000000000000 );
4584         }
4585         aSigExtra = 0;
4586     }
4587     else {
4588         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4589     }
4590     return roundAndPackInt64(aSign, aSig, aSigExtra, status);
4591 
4592 }
4593 
4594 /*----------------------------------------------------------------------------
4595 | Returns the result of converting the extended double-precision floating-
4596 | point value `a' to the 64-bit two's complement integer format.  The
4597 | conversion is performed according to the IEC/IEEE Standard for Binary
4598 | Floating-Point Arithmetic, except that the conversion is always rounded
4599 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4600 | Otherwise, if the conversion overflows, the largest integer with the same
4601 | sign as `a' is returned.
4602 *----------------------------------------------------------------------------*/
4603 
4604 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
4605 {
4606     flag aSign;
4607     int32_t aExp, shiftCount;
4608     uint64_t aSig;
4609     int64_t z;
4610 
4611     if (floatx80_invalid_encoding(a)) {
4612         float_raise(float_flag_invalid, status);
4613         return 1ULL << 63;
4614     }
4615     aSig = extractFloatx80Frac( a );
4616     aExp = extractFloatx80Exp( a );
4617     aSign = extractFloatx80Sign( a );
4618     shiftCount = aExp - 0x403E;
4619     if ( 0 <= shiftCount ) {
4620         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4621         if ( ( a.high != 0xC03E ) || aSig ) {
4622             float_raise(float_flag_invalid, status);
4623             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4624                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4625             }
4626         }
4627         return (int64_t) LIT64( 0x8000000000000000 );
4628     }
4629     else if ( aExp < 0x3FFF ) {
4630         if (aExp | aSig) {
4631             status->float_exception_flags |= float_flag_inexact;
4632         }
4633         return 0;
4634     }
4635     z = aSig>>( - shiftCount );
4636     if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
4637         status->float_exception_flags |= float_flag_inexact;
4638     }
4639     if ( aSign ) z = - z;
4640     return z;
4641 
4642 }
4643 
4644 /*----------------------------------------------------------------------------
4645 | Returns the result of converting the extended double-precision floating-
4646 | point value `a' to the single-precision floating-point format.  The
4647 | conversion is performed according to the IEC/IEEE Standard for Binary
4648 | Floating-Point Arithmetic.
4649 *----------------------------------------------------------------------------*/
4650 
4651 float32 floatx80_to_float32(floatx80 a, float_status *status)
4652 {
4653     flag aSign;
4654     int32_t aExp;
4655     uint64_t aSig;
4656 
4657     if (floatx80_invalid_encoding(a)) {
4658         float_raise(float_flag_invalid, status);
4659         return float32_default_nan(status);
4660     }
4661     aSig = extractFloatx80Frac( a );
4662     aExp = extractFloatx80Exp( a );
4663     aSign = extractFloatx80Sign( a );
4664     if ( aExp == 0x7FFF ) {
4665         if ( (uint64_t) ( aSig<<1 ) ) {
4666             return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
4667         }
4668         return packFloat32( aSign, 0xFF, 0 );
4669     }
4670     shift64RightJamming( aSig, 33, &aSig );
4671     if ( aExp || aSig ) aExp -= 0x3F81;
4672     return roundAndPackFloat32(aSign, aExp, aSig, status);
4673 
4674 }
4675 
4676 /*----------------------------------------------------------------------------
4677 | Returns the result of converting the extended double-precision floating-
4678 | point value `a' to the double-precision floating-point format.  The
4679 | conversion is performed according to the IEC/IEEE Standard for Binary
4680 | Floating-Point Arithmetic.
4681 *----------------------------------------------------------------------------*/
4682 
4683 float64 floatx80_to_float64(floatx80 a, float_status *status)
4684 {
4685     flag aSign;
4686     int32_t aExp;
4687     uint64_t aSig, zSig;
4688 
4689     if (floatx80_invalid_encoding(a)) {
4690         float_raise(float_flag_invalid, status);
4691         return float64_default_nan(status);
4692     }
4693     aSig = extractFloatx80Frac( a );
4694     aExp = extractFloatx80Exp( a );
4695     aSign = extractFloatx80Sign( a );
4696     if ( aExp == 0x7FFF ) {
4697         if ( (uint64_t) ( aSig<<1 ) ) {
4698             return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
4699         }
4700         return packFloat64( aSign, 0x7FF, 0 );
4701     }
4702     shift64RightJamming( aSig, 1, &zSig );
4703     if ( aExp || aSig ) aExp -= 0x3C01;
4704     return roundAndPackFloat64(aSign, aExp, zSig, status);
4705 
4706 }
4707 
4708 /*----------------------------------------------------------------------------
4709 | Returns the result of converting the extended double-precision floating-
4710 | point value `a' to the quadruple-precision floating-point format.  The
4711 | conversion is performed according to the IEC/IEEE Standard for Binary
4712 | Floating-Point Arithmetic.
4713 *----------------------------------------------------------------------------*/
4714 
4715 float128 floatx80_to_float128(floatx80 a, float_status *status)
4716 {
4717     flag aSign;
4718     int aExp;
4719     uint64_t aSig, zSig0, zSig1;
4720 
4721     if (floatx80_invalid_encoding(a)) {
4722         float_raise(float_flag_invalid, status);
4723         return float128_default_nan(status);
4724     }
4725     aSig = extractFloatx80Frac( a );
4726     aExp = extractFloatx80Exp( a );
4727     aSign = extractFloatx80Sign( a );
4728     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
4729         return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
4730     }
4731     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4732     return packFloat128( aSign, aExp, zSig0, zSig1 );
4733 
4734 }
4735 
4736 /*----------------------------------------------------------------------------
4737 | Rounds the extended double-precision floating-point value `a'
4738 | to the precision provided by floatx80_rounding_precision and returns the
4739 | result as an extended double-precision floating-point value.
4740 | The operation is performed according to the IEC/IEEE Standard for Binary
4741 | Floating-Point Arithmetic.
4742 *----------------------------------------------------------------------------*/
4743 
4744 floatx80 floatx80_round(floatx80 a, float_status *status)
4745 {
4746     return roundAndPackFloatx80(status->floatx80_rounding_precision,
4747                                 extractFloatx80Sign(a),
4748                                 extractFloatx80Exp(a),
4749                                 extractFloatx80Frac(a), 0, status);
4750 }
4751 
4752 /*----------------------------------------------------------------------------
4753 | Rounds the extended double-precision floating-point value `a' to an integer,
4754 | and returns the result as an extended quadruple-precision floating-point
4755 | value.  The operation is performed according to the IEC/IEEE Standard for
4756 | Binary Floating-Point Arithmetic.
4757 *----------------------------------------------------------------------------*/
4758 
4759 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
4760 {
4761     flag aSign;
4762     int32_t aExp;
4763     uint64_t lastBitMask, roundBitsMask;
4764     floatx80 z;
4765 
4766     if (floatx80_invalid_encoding(a)) {
4767         float_raise(float_flag_invalid, status);
4768         return floatx80_default_nan(status);
4769     }
4770     aExp = extractFloatx80Exp( a );
4771     if ( 0x403E <= aExp ) {
4772         if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
4773             return propagateFloatx80NaN(a, a, status);
4774         }
4775         return a;
4776     }
4777     if ( aExp < 0x3FFF ) {
4778         if (    ( aExp == 0 )
4779              && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
4780             return a;
4781         }
4782         status->float_exception_flags |= float_flag_inexact;
4783         aSign = extractFloatx80Sign( a );
4784         switch (status->float_rounding_mode) {
4785          case float_round_nearest_even:
4786             if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
4787                ) {
4788                 return
4789                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4790             }
4791             break;
4792         case float_round_ties_away:
4793             if (aExp == 0x3FFE) {
4794                 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
4795             }
4796             break;
4797          case float_round_down:
4798             return
4799                   aSign ?
4800                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4801                 : packFloatx80( 0, 0, 0 );
4802          case float_round_up:
4803             return
4804                   aSign ? packFloatx80( 1, 0, 0 )
4805                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4806         }
4807         return packFloatx80( aSign, 0, 0 );
4808     }
4809     lastBitMask = 1;
4810     lastBitMask <<= 0x403E - aExp;
4811     roundBitsMask = lastBitMask - 1;
4812     z = a;
4813     switch (status->float_rounding_mode) {
4814     case float_round_nearest_even:
4815         z.low += lastBitMask>>1;
4816         if ((z.low & roundBitsMask) == 0) {
4817             z.low &= ~lastBitMask;
4818         }
4819         break;
4820     case float_round_ties_away:
4821         z.low += lastBitMask >> 1;
4822         break;
4823     case float_round_to_zero:
4824         break;
4825     case float_round_up:
4826         if (!extractFloatx80Sign(z)) {
4827             z.low += roundBitsMask;
4828         }
4829         break;
4830     case float_round_down:
4831         if (extractFloatx80Sign(z)) {
4832             z.low += roundBitsMask;
4833         }
4834         break;
4835     default:
4836         abort();
4837     }
4838     z.low &= ~ roundBitsMask;
4839     if ( z.low == 0 ) {
4840         ++z.high;
4841         z.low = LIT64( 0x8000000000000000 );
4842     }
4843     if (z.low != a.low) {
4844         status->float_exception_flags |= float_flag_inexact;
4845     }
4846     return z;
4847 
4848 }
4849 
4850 /*----------------------------------------------------------------------------
4851 | Returns the result of adding the absolute values of the extended double-
4852 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
4853 | negated before being returned.  `zSign' is ignored if the result is a NaN.
4854 | The addition is performed according to the IEC/IEEE Standard for Binary
4855 | Floating-Point Arithmetic.
4856 *----------------------------------------------------------------------------*/
4857 
4858 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
4859                                 float_status *status)
4860 {
4861     int32_t aExp, bExp, zExp;
4862     uint64_t aSig, bSig, zSig0, zSig1;
4863     int32_t expDiff;
4864 
4865     aSig = extractFloatx80Frac( a );
4866     aExp = extractFloatx80Exp( a );
4867     bSig = extractFloatx80Frac( b );
4868     bExp = extractFloatx80Exp( b );
4869     expDiff = aExp - bExp;
4870     if ( 0 < expDiff ) {
4871         if ( aExp == 0x7FFF ) {
4872             if ((uint64_t)(aSig << 1)) {
4873                 return propagateFloatx80NaN(a, b, status);
4874             }
4875             return a;
4876         }
4877         if ( bExp == 0 ) --expDiff;
4878         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4879         zExp = aExp;
4880     }
4881     else if ( expDiff < 0 ) {
4882         if ( bExp == 0x7FFF ) {
4883             if ((uint64_t)(bSig << 1)) {
4884                 return propagateFloatx80NaN(a, b, status);
4885             }
4886             return packFloatx80(zSign,
4887                                 floatx80_infinity_high,
4888                                 floatx80_infinity_low);
4889         }
4890         if ( aExp == 0 ) ++expDiff;
4891         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4892         zExp = bExp;
4893     }
4894     else {
4895         if ( aExp == 0x7FFF ) {
4896             if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4897                 return propagateFloatx80NaN(a, b, status);
4898             }
4899             return a;
4900         }
4901         zSig1 = 0;
4902         zSig0 = aSig + bSig;
4903         if ( aExp == 0 ) {
4904             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4905             goto roundAndPack;
4906         }
4907         zExp = aExp;
4908         goto shiftRight1;
4909     }
4910     zSig0 = aSig + bSig;
4911     if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
4912  shiftRight1:
4913     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4914     zSig0 |= LIT64( 0x8000000000000000 );
4915     ++zExp;
4916  roundAndPack:
4917     return roundAndPackFloatx80(status->floatx80_rounding_precision,
4918                                 zSign, zExp, zSig0, zSig1, status);
4919 }
4920 
4921 /*----------------------------------------------------------------------------
4922 | Returns the result of subtracting the absolute values of the extended
4923 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
4924 | difference is negated before being returned.  `zSign' is ignored if the
4925 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
4926 | Standard for Binary Floating-Point Arithmetic.
4927 *----------------------------------------------------------------------------*/
4928 
4929 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
4930                                 float_status *status)
4931 {
4932     int32_t aExp, bExp, zExp;
4933     uint64_t aSig, bSig, zSig0, zSig1;
4934     int32_t expDiff;
4935 
4936     aSig = extractFloatx80Frac( a );
4937     aExp = extractFloatx80Exp( a );
4938     bSig = extractFloatx80Frac( b );
4939     bExp = extractFloatx80Exp( b );
4940     expDiff = aExp - bExp;
4941     if ( 0 < expDiff ) goto aExpBigger;
4942     if ( expDiff < 0 ) goto bExpBigger;
4943     if ( aExp == 0x7FFF ) {
4944         if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4945             return propagateFloatx80NaN(a, b, status);
4946         }
4947         float_raise(float_flag_invalid, status);
4948         return floatx80_default_nan(status);
4949     }
4950     if ( aExp == 0 ) {
4951         aExp = 1;
4952         bExp = 1;
4953     }
4954     zSig1 = 0;
4955     if ( bSig < aSig ) goto aBigger;
4956     if ( aSig < bSig ) goto bBigger;
4957     return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
4958  bExpBigger:
4959     if ( bExp == 0x7FFF ) {
4960         if ((uint64_t)(bSig << 1)) {
4961             return propagateFloatx80NaN(a, b, status);
4962         }
4963         return packFloatx80(zSign ^ 1, floatx80_infinity_high,
4964                             floatx80_infinity_low);
4965     }
4966     if ( aExp == 0 ) ++expDiff;
4967     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4968  bBigger:
4969     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4970     zExp = bExp;
4971     zSign ^= 1;
4972     goto normalizeRoundAndPack;
4973  aExpBigger:
4974     if ( aExp == 0x7FFF ) {
4975         if ((uint64_t)(aSig << 1)) {
4976             return propagateFloatx80NaN(a, b, status);
4977         }
4978         return a;
4979     }
4980     if ( bExp == 0 ) --expDiff;
4981     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4982  aBigger:
4983     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4984     zExp = aExp;
4985  normalizeRoundAndPack:
4986     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
4987                                          zSign, zExp, zSig0, zSig1, status);
4988 }
4989 
4990 /*----------------------------------------------------------------------------
4991 | Returns the result of adding the extended double-precision floating-point
4992 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
4993 | Standard for Binary Floating-Point Arithmetic.
4994 *----------------------------------------------------------------------------*/
4995 
4996 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
4997 {
4998     flag aSign, bSign;
4999 
5000     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5001         float_raise(float_flag_invalid, status);
5002         return floatx80_default_nan(status);
5003     }
5004     aSign = extractFloatx80Sign( a );
5005     bSign = extractFloatx80Sign( b );
5006     if ( aSign == bSign ) {
5007         return addFloatx80Sigs(a, b, aSign, status);
5008     }
5009     else {
5010         return subFloatx80Sigs(a, b, aSign, status);
5011     }
5012 
5013 }
5014 
5015 /*----------------------------------------------------------------------------
5016 | Returns the result of subtracting the extended double-precision floating-
5017 | point values `a' and `b'.  The operation is performed according to the
5018 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5019 *----------------------------------------------------------------------------*/
5020 
5021 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
5022 {
5023     flag aSign, bSign;
5024 
5025     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5026         float_raise(float_flag_invalid, status);
5027         return floatx80_default_nan(status);
5028     }
5029     aSign = extractFloatx80Sign( a );
5030     bSign = extractFloatx80Sign( b );
5031     if ( aSign == bSign ) {
5032         return subFloatx80Sigs(a, b, aSign, status);
5033     }
5034     else {
5035         return addFloatx80Sigs(a, b, aSign, status);
5036     }
5037 
5038 }
5039 
5040 /*----------------------------------------------------------------------------
5041 | Returns the result of multiplying the extended double-precision floating-
5042 | point values `a' and `b'.  The operation is performed according to the
5043 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5044 *----------------------------------------------------------------------------*/
5045 
5046 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
5047 {
5048     flag aSign, bSign, zSign;
5049     int32_t aExp, bExp, zExp;
5050     uint64_t aSig, bSig, zSig0, zSig1;
5051 
5052     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5053         float_raise(float_flag_invalid, status);
5054         return floatx80_default_nan(status);
5055     }
5056     aSig = extractFloatx80Frac( a );
5057     aExp = extractFloatx80Exp( a );
5058     aSign = extractFloatx80Sign( a );
5059     bSig = extractFloatx80Frac( b );
5060     bExp = extractFloatx80Exp( b );
5061     bSign = extractFloatx80Sign( b );
5062     zSign = aSign ^ bSign;
5063     if ( aExp == 0x7FFF ) {
5064         if (    (uint64_t) ( aSig<<1 )
5065              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5066             return propagateFloatx80NaN(a, b, status);
5067         }
5068         if ( ( bExp | bSig ) == 0 ) goto invalid;
5069         return packFloatx80(zSign, floatx80_infinity_high,
5070                                    floatx80_infinity_low);
5071     }
5072     if ( bExp == 0x7FFF ) {
5073         if ((uint64_t)(bSig << 1)) {
5074             return propagateFloatx80NaN(a, b, status);
5075         }
5076         if ( ( aExp | aSig ) == 0 ) {
5077  invalid:
5078             float_raise(float_flag_invalid, status);
5079             return floatx80_default_nan(status);
5080         }
5081         return packFloatx80(zSign, floatx80_infinity_high,
5082                                    floatx80_infinity_low);
5083     }
5084     if ( aExp == 0 ) {
5085         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5086         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5087     }
5088     if ( bExp == 0 ) {
5089         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5090         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5091     }
5092     zExp = aExp + bExp - 0x3FFE;
5093     mul64To128( aSig, bSig, &zSig0, &zSig1 );
5094     if ( 0 < (int64_t) zSig0 ) {
5095         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5096         --zExp;
5097     }
5098     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5099                                 zSign, zExp, zSig0, zSig1, status);
5100 }
5101 
5102 /*----------------------------------------------------------------------------
5103 | Returns the result of dividing the extended double-precision floating-point
5104 | value `a' by the corresponding value `b'.  The operation is performed
5105 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5106 *----------------------------------------------------------------------------*/
5107 
5108 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
5109 {
5110     flag aSign, bSign, zSign;
5111     int32_t aExp, bExp, zExp;
5112     uint64_t aSig, bSig, zSig0, zSig1;
5113     uint64_t rem0, rem1, rem2, term0, term1, term2;
5114 
5115     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5116         float_raise(float_flag_invalid, status);
5117         return floatx80_default_nan(status);
5118     }
5119     aSig = extractFloatx80Frac( a );
5120     aExp = extractFloatx80Exp( a );
5121     aSign = extractFloatx80Sign( a );
5122     bSig = extractFloatx80Frac( b );
5123     bExp = extractFloatx80Exp( b );
5124     bSign = extractFloatx80Sign( b );
5125     zSign = aSign ^ bSign;
5126     if ( aExp == 0x7FFF ) {
5127         if ((uint64_t)(aSig << 1)) {
5128             return propagateFloatx80NaN(a, b, status);
5129         }
5130         if ( bExp == 0x7FFF ) {
5131             if ((uint64_t)(bSig << 1)) {
5132                 return propagateFloatx80NaN(a, b, status);
5133             }
5134             goto invalid;
5135         }
5136         return packFloatx80(zSign, floatx80_infinity_high,
5137                                    floatx80_infinity_low);
5138     }
5139     if ( bExp == 0x7FFF ) {
5140         if ((uint64_t)(bSig << 1)) {
5141             return propagateFloatx80NaN(a, b, status);
5142         }
5143         return packFloatx80( zSign, 0, 0 );
5144     }
5145     if ( bExp == 0 ) {
5146         if ( bSig == 0 ) {
5147             if ( ( aExp | aSig ) == 0 ) {
5148  invalid:
5149                 float_raise(float_flag_invalid, status);
5150                 return floatx80_default_nan(status);
5151             }
5152             float_raise(float_flag_divbyzero, status);
5153             return packFloatx80(zSign, floatx80_infinity_high,
5154                                        floatx80_infinity_low);
5155         }
5156         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5157     }
5158     if ( aExp == 0 ) {
5159         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5160         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5161     }
5162     zExp = aExp - bExp + 0x3FFE;
5163     rem1 = 0;
5164     if ( bSig <= aSig ) {
5165         shift128Right( aSig, 0, 1, &aSig, &rem1 );
5166         ++zExp;
5167     }
5168     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5169     mul64To128( bSig, zSig0, &term0, &term1 );
5170     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
5171     while ( (int64_t) rem0 < 0 ) {
5172         --zSig0;
5173         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5174     }
5175     zSig1 = estimateDiv128To64( rem1, 0, bSig );
5176     if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
5177         mul64To128( bSig, zSig1, &term1, &term2 );
5178         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5179         while ( (int64_t) rem1 < 0 ) {
5180             --zSig1;
5181             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5182         }
5183         zSig1 |= ( ( rem1 | rem2 ) != 0 );
5184     }
5185     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5186                                 zSign, zExp, zSig0, zSig1, status);
5187 }
5188 
5189 /*----------------------------------------------------------------------------
5190 | Returns the remainder of the extended double-precision floating-point value
5191 | `a' with respect to the corresponding value `b'.  The operation is performed
5192 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5193 *----------------------------------------------------------------------------*/
5194 
5195 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
5196 {
5197     flag aSign, zSign;
5198     int32_t aExp, bExp, expDiff;
5199     uint64_t aSig0, aSig1, bSig;
5200     uint64_t q, term0, term1, alternateASig0, alternateASig1;
5201 
5202     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5203         float_raise(float_flag_invalid, status);
5204         return floatx80_default_nan(status);
5205     }
5206     aSig0 = extractFloatx80Frac( a );
5207     aExp = extractFloatx80Exp( a );
5208     aSign = extractFloatx80Sign( a );
5209     bSig = extractFloatx80Frac( b );
5210     bExp = extractFloatx80Exp( b );
5211     if ( aExp == 0x7FFF ) {
5212         if (    (uint64_t) ( aSig0<<1 )
5213              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5214             return propagateFloatx80NaN(a, b, status);
5215         }
5216         goto invalid;
5217     }
5218     if ( bExp == 0x7FFF ) {
5219         if ((uint64_t)(bSig << 1)) {
5220             return propagateFloatx80NaN(a, b, status);
5221         }
5222         return a;
5223     }
5224     if ( bExp == 0 ) {
5225         if ( bSig == 0 ) {
5226  invalid:
5227             float_raise(float_flag_invalid, status);
5228             return floatx80_default_nan(status);
5229         }
5230         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5231     }
5232     if ( aExp == 0 ) {
5233         if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
5234         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5235     }
5236     bSig |= LIT64( 0x8000000000000000 );
5237     zSign = aSign;
5238     expDiff = aExp - bExp;
5239     aSig1 = 0;
5240     if ( expDiff < 0 ) {
5241         if ( expDiff < -1 ) return a;
5242         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5243         expDiff = 0;
5244     }
5245     q = ( bSig <= aSig0 );
5246     if ( q ) aSig0 -= bSig;
5247     expDiff -= 64;
5248     while ( 0 < expDiff ) {
5249         q = estimateDiv128To64( aSig0, aSig1, bSig );
5250         q = ( 2 < q ) ? q - 2 : 0;
5251         mul64To128( bSig, q, &term0, &term1 );
5252         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5253         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5254         expDiff -= 62;
5255     }
5256     expDiff += 64;
5257     if ( 0 < expDiff ) {
5258         q = estimateDiv128To64( aSig0, aSig1, bSig );
5259         q = ( 2 < q ) ? q - 2 : 0;
5260         q >>= 64 - expDiff;
5261         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5262         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5263         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5264         while ( le128( term0, term1, aSig0, aSig1 ) ) {
5265             ++q;
5266             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5267         }
5268     }
5269     else {
5270         term1 = 0;
5271         term0 = bSig;
5272     }
5273     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5274     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5275          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5276               && ( q & 1 ) )
5277        ) {
5278         aSig0 = alternateASig0;
5279         aSig1 = alternateASig1;
5280         zSign = ! zSign;
5281     }
5282     return
5283         normalizeRoundAndPackFloatx80(
5284             80, zSign, bExp + expDiff, aSig0, aSig1, status);
5285 
5286 }
5287 
5288 /*----------------------------------------------------------------------------
5289 | Returns the square root of the extended double-precision floating-point
5290 | value `a'.  The operation is performed according to the IEC/IEEE Standard
5291 | for Binary Floating-Point Arithmetic.
5292 *----------------------------------------------------------------------------*/
5293 
5294 floatx80 floatx80_sqrt(floatx80 a, float_status *status)
5295 {
5296     flag aSign;
5297     int32_t aExp, zExp;
5298     uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5299     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5300 
5301     if (floatx80_invalid_encoding(a)) {
5302         float_raise(float_flag_invalid, status);
5303         return floatx80_default_nan(status);
5304     }
5305     aSig0 = extractFloatx80Frac( a );
5306     aExp = extractFloatx80Exp( a );
5307     aSign = extractFloatx80Sign( a );
5308     if ( aExp == 0x7FFF ) {
5309         if ((uint64_t)(aSig0 << 1)) {
5310             return propagateFloatx80NaN(a, a, status);
5311         }
5312         if ( ! aSign ) return a;
5313         goto invalid;
5314     }
5315     if ( aSign ) {
5316         if ( ( aExp | aSig0 ) == 0 ) return a;
5317  invalid:
5318         float_raise(float_flag_invalid, status);
5319         return floatx80_default_nan(status);
5320     }
5321     if ( aExp == 0 ) {
5322         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5323         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5324     }
5325     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5326     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5327     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5328     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5329     doubleZSig0 = zSig0<<1;
5330     mul64To128( zSig0, zSig0, &term0, &term1 );
5331     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5332     while ( (int64_t) rem0 < 0 ) {
5333         --zSig0;
5334         doubleZSig0 -= 2;
5335         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5336     }
5337     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5338     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5339         if ( zSig1 == 0 ) zSig1 = 1;
5340         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5341         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5342         mul64To128( zSig1, zSig1, &term2, &term3 );
5343         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5344         while ( (int64_t) rem1 < 0 ) {
5345             --zSig1;
5346             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5347             term3 |= 1;
5348             term2 |= doubleZSig0;
5349             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5350         }
5351         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5352     }
5353     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5354     zSig0 |= doubleZSig0;
5355     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5356                                 0, zExp, zSig0, zSig1, status);
5357 }
5358 
5359 /*----------------------------------------------------------------------------
5360 | Returns 1 if the extended double-precision floating-point value `a' is equal
5361 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
5362 | raised if either operand is a NaN.  Otherwise, the comparison is performed
5363 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5364 *----------------------------------------------------------------------------*/
5365 
5366 int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
5367 {
5368 
5369     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5370         || (extractFloatx80Exp(a) == 0x7FFF
5371             && (uint64_t) (extractFloatx80Frac(a) << 1))
5372         || (extractFloatx80Exp(b) == 0x7FFF
5373             && (uint64_t) (extractFloatx80Frac(b) << 1))
5374        ) {
5375         float_raise(float_flag_invalid, status);
5376         return 0;
5377     }
5378     return
5379            ( a.low == b.low )
5380         && (    ( a.high == b.high )
5381              || (    ( a.low == 0 )
5382                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5383            );
5384 
5385 }
5386 
5387 /*----------------------------------------------------------------------------
5388 | Returns 1 if the extended double-precision floating-point value `a' is
5389 | less than or equal to the corresponding value `b', and 0 otherwise.  The
5390 | invalid exception is raised if either operand is a NaN.  The comparison is
5391 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5392 | Arithmetic.
5393 *----------------------------------------------------------------------------*/
5394 
5395 int floatx80_le(floatx80 a, floatx80 b, float_status *status)
5396 {
5397     flag aSign, bSign;
5398 
5399     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5400         || (extractFloatx80Exp(a) == 0x7FFF
5401             && (uint64_t) (extractFloatx80Frac(a) << 1))
5402         || (extractFloatx80Exp(b) == 0x7FFF
5403             && (uint64_t) (extractFloatx80Frac(b) << 1))
5404        ) {
5405         float_raise(float_flag_invalid, status);
5406         return 0;
5407     }
5408     aSign = extractFloatx80Sign( a );
5409     bSign = extractFloatx80Sign( b );
5410     if ( aSign != bSign ) {
5411         return
5412                aSign
5413             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5414                  == 0 );
5415     }
5416     return
5417           aSign ? le128( b.high, b.low, a.high, a.low )
5418         : le128( a.high, a.low, b.high, b.low );
5419 
5420 }
5421 
5422 /*----------------------------------------------------------------------------
5423 | Returns 1 if the extended double-precision floating-point value `a' is
5424 | less than the corresponding value `b', and 0 otherwise.  The invalid
5425 | exception is raised if either operand is a NaN.  The comparison is performed
5426 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5427 *----------------------------------------------------------------------------*/
5428 
5429 int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
5430 {
5431     flag aSign, bSign;
5432 
5433     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5434         || (extractFloatx80Exp(a) == 0x7FFF
5435             && (uint64_t) (extractFloatx80Frac(a) << 1))
5436         || (extractFloatx80Exp(b) == 0x7FFF
5437             && (uint64_t) (extractFloatx80Frac(b) << 1))
5438        ) {
5439         float_raise(float_flag_invalid, status);
5440         return 0;
5441     }
5442     aSign = extractFloatx80Sign( a );
5443     bSign = extractFloatx80Sign( b );
5444     if ( aSign != bSign ) {
5445         return
5446                aSign
5447             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5448                  != 0 );
5449     }
5450     return
5451           aSign ? lt128( b.high, b.low, a.high, a.low )
5452         : lt128( a.high, a.low, b.high, b.low );
5453 
5454 }
5455 
5456 /*----------------------------------------------------------------------------
5457 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5458 | cannot be compared, and 0 otherwise.  The invalid exception is raised if
5459 | either operand is a NaN.   The comparison is performed according to the
5460 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5461 *----------------------------------------------------------------------------*/
5462 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
5463 {
5464     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5465         || (extractFloatx80Exp(a) == 0x7FFF
5466             && (uint64_t) (extractFloatx80Frac(a) << 1))
5467         || (extractFloatx80Exp(b) == 0x7FFF
5468             && (uint64_t) (extractFloatx80Frac(b) << 1))
5469        ) {
5470         float_raise(float_flag_invalid, status);
5471         return 1;
5472     }
5473     return 0;
5474 }
5475 
5476 /*----------------------------------------------------------------------------
5477 | Returns 1 if the extended double-precision floating-point value `a' is
5478 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5479 | cause an exception.  The comparison is performed according to the IEC/IEEE
5480 | Standard for Binary Floating-Point Arithmetic.
5481 *----------------------------------------------------------------------------*/
5482 
5483 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
5484 {
5485 
5486     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5487         float_raise(float_flag_invalid, status);
5488         return 0;
5489     }
5490     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5491               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5492          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5493               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5494        ) {
5495         if (floatx80_is_signaling_nan(a, status)
5496          || floatx80_is_signaling_nan(b, status)) {
5497             float_raise(float_flag_invalid, status);
5498         }
5499         return 0;
5500     }
5501     return
5502            ( a.low == b.low )
5503         && (    ( a.high == b.high )
5504              || (    ( a.low == 0 )
5505                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5506            );
5507 
5508 }
5509 
5510 /*----------------------------------------------------------------------------
5511 | Returns 1 if the extended double-precision floating-point value `a' is less
5512 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
5513 | do not cause an exception.  Otherwise, the comparison is performed according
5514 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5515 *----------------------------------------------------------------------------*/
5516 
5517 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
5518 {
5519     flag aSign, bSign;
5520 
5521     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5522         float_raise(float_flag_invalid, status);
5523         return 0;
5524     }
5525     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5526               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5527          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5528               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5529        ) {
5530         if (floatx80_is_signaling_nan(a, status)
5531          || floatx80_is_signaling_nan(b, status)) {
5532             float_raise(float_flag_invalid, status);
5533         }
5534         return 0;
5535     }
5536     aSign = extractFloatx80Sign( a );
5537     bSign = extractFloatx80Sign( b );
5538     if ( aSign != bSign ) {
5539         return
5540                aSign
5541             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5542                  == 0 );
5543     }
5544     return
5545           aSign ? le128( b.high, b.low, a.high, a.low )
5546         : le128( a.high, a.low, b.high, b.low );
5547 
5548 }
5549 
5550 /*----------------------------------------------------------------------------
5551 | Returns 1 if the extended double-precision floating-point value `a' is less
5552 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
5553 | an exception.  Otherwise, the comparison is performed according to the
5554 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5555 *----------------------------------------------------------------------------*/
5556 
5557 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
5558 {
5559     flag aSign, bSign;
5560 
5561     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5562         float_raise(float_flag_invalid, status);
5563         return 0;
5564     }
5565     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5566               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5567          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5568               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5569        ) {
5570         if (floatx80_is_signaling_nan(a, status)
5571          || floatx80_is_signaling_nan(b, status)) {
5572             float_raise(float_flag_invalid, status);
5573         }
5574         return 0;
5575     }
5576     aSign = extractFloatx80Sign( a );
5577     bSign = extractFloatx80Sign( b );
5578     if ( aSign != bSign ) {
5579         return
5580                aSign
5581             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5582                  != 0 );
5583     }
5584     return
5585           aSign ? lt128( b.high, b.low, a.high, a.low )
5586         : lt128( a.high, a.low, b.high, b.low );
5587 
5588 }
5589 
5590 /*----------------------------------------------------------------------------
5591 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5592 | cannot be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.
5593 | The comparison is performed according to the IEC/IEEE Standard for Binary
5594 | Floating-Point Arithmetic.
5595 *----------------------------------------------------------------------------*/
5596 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
5597 {
5598     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5599         float_raise(float_flag_invalid, status);
5600         return 1;
5601     }
5602     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5603               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5604          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5605               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5606        ) {
5607         if (floatx80_is_signaling_nan(a, status)
5608          || floatx80_is_signaling_nan(b, status)) {
5609             float_raise(float_flag_invalid, status);
5610         }
5611         return 1;
5612     }
5613     return 0;
5614 }
5615 
5616 /*----------------------------------------------------------------------------
5617 | Returns the result of converting the quadruple-precision floating-point
5618 | value `a' to the 32-bit two's complement integer format.  The conversion
5619 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5620 | Arithmetic---which means in particular that the conversion is rounded
5621 | according to the current rounding mode.  If `a' is a NaN, the largest
5622 | positive integer is returned.  Otherwise, if the conversion overflows, the
5623 | largest integer with the same sign as `a' is returned.
5624 *----------------------------------------------------------------------------*/
5625 
5626 int32_t float128_to_int32(float128 a, float_status *status)
5627 {
5628     flag aSign;
5629     int32_t aExp, shiftCount;
5630     uint64_t aSig0, aSig1;
5631 
5632     aSig1 = extractFloat128Frac1( a );
5633     aSig0 = extractFloat128Frac0( a );
5634     aExp = extractFloat128Exp( a );
5635     aSign = extractFloat128Sign( a );
5636     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5637     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5638     aSig0 |= ( aSig1 != 0 );
5639     shiftCount = 0x4028 - aExp;
5640     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5641     return roundAndPackInt32(aSign, aSig0, status);
5642 
5643 }
5644 
5645 /*----------------------------------------------------------------------------
5646 | Returns the result of converting the quadruple-precision floating-point
5647 | value `a' to the 32-bit two's complement integer format.  The conversion
5648 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5649 | Arithmetic, except that the conversion is always rounded toward zero.  If
5650 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
5651 | conversion overflows, the largest integer with the same sign as `a' is
5652 | returned.
5653 *----------------------------------------------------------------------------*/
5654 
5655 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
5656 {
5657     flag aSign;
5658     int32_t aExp, shiftCount;
5659     uint64_t aSig0, aSig1, savedASig;
5660     int32_t z;
5661 
5662     aSig1 = extractFloat128Frac1( a );
5663     aSig0 = extractFloat128Frac0( a );
5664     aExp = extractFloat128Exp( a );
5665     aSign = extractFloat128Sign( a );
5666     aSig0 |= ( aSig1 != 0 );
5667     if ( 0x401E < aExp ) {
5668         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5669         goto invalid;
5670     }
5671     else if ( aExp < 0x3FFF ) {
5672         if (aExp || aSig0) {
5673             status->float_exception_flags |= float_flag_inexact;
5674         }
5675         return 0;
5676     }
5677     aSig0 |= LIT64( 0x0001000000000000 );
5678     shiftCount = 0x402F - aExp;
5679     savedASig = aSig0;
5680     aSig0 >>= shiftCount;
5681     z = aSig0;
5682     if ( aSign ) z = - z;
5683     if ( ( z < 0 ) ^ aSign ) {
5684  invalid:
5685         float_raise(float_flag_invalid, status);
5686         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5687     }
5688     if ( ( aSig0<<shiftCount ) != savedASig ) {
5689         status->float_exception_flags |= float_flag_inexact;
5690     }
5691     return z;
5692 
5693 }
5694 
5695 /*----------------------------------------------------------------------------
5696 | Returns the result of converting the quadruple-precision floating-point
5697 | value `a' to the 64-bit two's complement integer format.  The conversion
5698 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5699 | Arithmetic---which means in particular that the conversion is rounded
5700 | according to the current rounding mode.  If `a' is a NaN, the largest
5701 | positive integer is returned.  Otherwise, if the conversion overflows, the
5702 | largest integer with the same sign as `a' is returned.
5703 *----------------------------------------------------------------------------*/
5704 
5705 int64_t float128_to_int64(float128 a, float_status *status)
5706 {
5707     flag aSign;
5708     int32_t aExp, shiftCount;
5709     uint64_t aSig0, aSig1;
5710 
5711     aSig1 = extractFloat128Frac1( a );
5712     aSig0 = extractFloat128Frac0( a );
5713     aExp = extractFloat128Exp( a );
5714     aSign = extractFloat128Sign( a );
5715     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5716     shiftCount = 0x402F - aExp;
5717     if ( shiftCount <= 0 ) {
5718         if ( 0x403E < aExp ) {
5719             float_raise(float_flag_invalid, status);
5720             if (    ! aSign
5721                  || (    ( aExp == 0x7FFF )
5722                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5723                     )
5724                ) {
5725                 return LIT64( 0x7FFFFFFFFFFFFFFF );
5726             }
5727             return (int64_t) LIT64( 0x8000000000000000 );
5728         }
5729         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5730     }
5731     else {
5732         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5733     }
5734     return roundAndPackInt64(aSign, aSig0, aSig1, status);
5735 
5736 }
5737 
5738 /*----------------------------------------------------------------------------
5739 | Returns the result of converting the quadruple-precision floating-point
5740 | value `a' to the 64-bit two's complement integer format.  The conversion
5741 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5742 | Arithmetic, except that the conversion is always rounded toward zero.
5743 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
5744 | the conversion overflows, the largest integer with the same sign as `a' is
5745 | returned.
5746 *----------------------------------------------------------------------------*/
5747 
5748 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
5749 {
5750     flag aSign;
5751     int32_t aExp, shiftCount;
5752     uint64_t aSig0, aSig1;
5753     int64_t z;
5754 
5755     aSig1 = extractFloat128Frac1( a );
5756     aSig0 = extractFloat128Frac0( a );
5757     aExp = extractFloat128Exp( a );
5758     aSign = extractFloat128Sign( a );
5759     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5760     shiftCount = aExp - 0x402F;
5761     if ( 0 < shiftCount ) {
5762         if ( 0x403E <= aExp ) {
5763             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5764             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
5765                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
5766                 if (aSig1) {
5767                     status->float_exception_flags |= float_flag_inexact;
5768                 }
5769             }
5770             else {
5771                 float_raise(float_flag_invalid, status);
5772                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
5773                     return LIT64( 0x7FFFFFFFFFFFFFFF );
5774                 }
5775             }
5776             return (int64_t) LIT64( 0x8000000000000000 );
5777         }
5778         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
5779         if ( (uint64_t) ( aSig1<<shiftCount ) ) {
5780             status->float_exception_flags |= float_flag_inexact;
5781         }
5782     }
5783     else {
5784         if ( aExp < 0x3FFF ) {
5785             if ( aExp | aSig0 | aSig1 ) {
5786                 status->float_exception_flags |= float_flag_inexact;
5787             }
5788             return 0;
5789         }
5790         z = aSig0>>( - shiftCount );
5791         if (    aSig1
5792              || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
5793             status->float_exception_flags |= float_flag_inexact;
5794         }
5795     }
5796     if ( aSign ) z = - z;
5797     return z;
5798 
5799 }
5800 
5801 /*----------------------------------------------------------------------------
5802 | Returns the result of converting the quadruple-precision floating-point value
5803 | `a' to the 64-bit unsigned integer format.  The conversion is
5804 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5805 | Arithmetic---which means in particular that the conversion is rounded
5806 | according to the current rounding mode.  If `a' is a NaN, the largest
5807 | positive integer is returned.  If the conversion overflows, the
5808 | largest unsigned integer is returned.  If 'a' is negative, the value is
5809 | rounded and zero is returned; negative values that do not round to zero
5810 | will raise the inexact exception.
5811 *----------------------------------------------------------------------------*/
5812 
5813 uint64_t float128_to_uint64(float128 a, float_status *status)
5814 {
5815     flag aSign;
5816     int aExp;
5817     int shiftCount;
5818     uint64_t aSig0, aSig1;
5819 
5820     aSig0 = extractFloat128Frac0(a);
5821     aSig1 = extractFloat128Frac1(a);
5822     aExp = extractFloat128Exp(a);
5823     aSign = extractFloat128Sign(a);
5824     if (aSign && (aExp > 0x3FFE)) {
5825         float_raise(float_flag_invalid, status);
5826         if (float128_is_any_nan(a)) {
5827             return LIT64(0xFFFFFFFFFFFFFFFF);
5828         } else {
5829             return 0;
5830         }
5831     }
5832     if (aExp) {
5833         aSig0 |= LIT64(0x0001000000000000);
5834     }
5835     shiftCount = 0x402F - aExp;
5836     if (shiftCount <= 0) {
5837         if (0x403E < aExp) {
5838             float_raise(float_flag_invalid, status);
5839             return LIT64(0xFFFFFFFFFFFFFFFF);
5840         }
5841         shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
5842     } else {
5843         shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
5844     }
5845     return roundAndPackUint64(aSign, aSig0, aSig1, status);
5846 }
5847 
5848 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
5849 {
5850     uint64_t v;
5851     signed char current_rounding_mode = status->float_rounding_mode;
5852 
5853     set_float_rounding_mode(float_round_to_zero, status);
5854     v = float128_to_uint64(a, status);
5855     set_float_rounding_mode(current_rounding_mode, status);
5856 
5857     return v;
5858 }
5859 
5860 /*----------------------------------------------------------------------------
5861 | Returns the result of converting the quadruple-precision floating-point
5862 | value `a' to the 32-bit unsigned integer format.  The conversion
5863 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5864 | Arithmetic except that the conversion is always rounded toward zero.
5865 | If `a' is a NaN, the largest positive integer is returned.  Otherwise,
5866 | if the conversion overflows, the largest unsigned integer is returned.
5867 | If 'a' is negative, the value is rounded and zero is returned; negative
5868 | values that do not round to zero will raise the inexact exception.
5869 *----------------------------------------------------------------------------*/
5870 
5871 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
5872 {
5873     uint64_t v;
5874     uint32_t res;
5875     int old_exc_flags = get_float_exception_flags(status);
5876 
5877     v = float128_to_uint64_round_to_zero(a, status);
5878     if (v > 0xffffffff) {
5879         res = 0xffffffff;
5880     } else {
5881         return v;
5882     }
5883     set_float_exception_flags(old_exc_flags, status);
5884     float_raise(float_flag_invalid, status);
5885     return res;
5886 }
5887 
5888 /*----------------------------------------------------------------------------
5889 | Returns the result of converting the quadruple-precision floating-point
5890 | value `a' to the single-precision floating-point format.  The conversion
5891 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5892 | Arithmetic.
5893 *----------------------------------------------------------------------------*/
5894 
5895 float32 float128_to_float32(float128 a, float_status *status)
5896 {
5897     flag aSign;
5898     int32_t aExp;
5899     uint64_t aSig0, aSig1;
5900     uint32_t zSig;
5901 
5902     aSig1 = extractFloat128Frac1( a );
5903     aSig0 = extractFloat128Frac0( a );
5904     aExp = extractFloat128Exp( a );
5905     aSign = extractFloat128Sign( a );
5906     if ( aExp == 0x7FFF ) {
5907         if ( aSig0 | aSig1 ) {
5908             return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
5909         }
5910         return packFloat32( aSign, 0xFF, 0 );
5911     }
5912     aSig0 |= ( aSig1 != 0 );
5913     shift64RightJamming( aSig0, 18, &aSig0 );
5914     zSig = aSig0;
5915     if ( aExp || zSig ) {
5916         zSig |= 0x40000000;
5917         aExp -= 0x3F81;
5918     }
5919     return roundAndPackFloat32(aSign, aExp, zSig, status);
5920 
5921 }
5922 
5923 /*----------------------------------------------------------------------------
5924 | Returns the result of converting the quadruple-precision floating-point
5925 | value `a' to the double-precision floating-point format.  The conversion
5926 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5927 | Arithmetic.
5928 *----------------------------------------------------------------------------*/
5929 
5930 float64 float128_to_float64(float128 a, float_status *status)
5931 {
5932     flag aSign;
5933     int32_t aExp;
5934     uint64_t aSig0, aSig1;
5935 
5936     aSig1 = extractFloat128Frac1( a );
5937     aSig0 = extractFloat128Frac0( a );
5938     aExp = extractFloat128Exp( a );
5939     aSign = extractFloat128Sign( a );
5940     if ( aExp == 0x7FFF ) {
5941         if ( aSig0 | aSig1 ) {
5942             return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
5943         }
5944         return packFloat64( aSign, 0x7FF, 0 );
5945     }
5946     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5947     aSig0 |= ( aSig1 != 0 );
5948     if ( aExp || aSig0 ) {
5949         aSig0 |= LIT64( 0x4000000000000000 );
5950         aExp -= 0x3C01;
5951     }
5952     return roundAndPackFloat64(aSign, aExp, aSig0, status);
5953 
5954 }
5955 
5956 /*----------------------------------------------------------------------------
5957 | Returns the result of converting the quadruple-precision floating-point
5958 | value `a' to the extended double-precision floating-point format.  The
5959 | conversion is performed according to the IEC/IEEE Standard for Binary
5960 | Floating-Point Arithmetic.
5961 *----------------------------------------------------------------------------*/
5962 
5963 floatx80 float128_to_floatx80(float128 a, float_status *status)
5964 {
5965     flag aSign;
5966     int32_t aExp;
5967     uint64_t aSig0, aSig1;
5968 
5969     aSig1 = extractFloat128Frac1( a );
5970     aSig0 = extractFloat128Frac0( a );
5971     aExp = extractFloat128Exp( a );
5972     aSign = extractFloat128Sign( a );
5973     if ( aExp == 0x7FFF ) {
5974         if ( aSig0 | aSig1 ) {
5975             return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
5976         }
5977         return packFloatx80(aSign, floatx80_infinity_high,
5978                                    floatx80_infinity_low);
5979     }
5980     if ( aExp == 0 ) {
5981         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
5982         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5983     }
5984     else {
5985         aSig0 |= LIT64( 0x0001000000000000 );
5986     }
5987     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
5988     return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
5989 
5990 }
5991 
5992 /*----------------------------------------------------------------------------
5993 | Rounds the quadruple-precision floating-point value `a' to an integer, and
5994 | returns the result as a quadruple-precision floating-point value.  The
5995 | operation is performed according to the IEC/IEEE Standard for Binary
5996 | Floating-Point Arithmetic.
5997 *----------------------------------------------------------------------------*/
5998 
5999 float128 float128_round_to_int(float128 a, float_status *status)
6000 {
6001     flag aSign;
6002     int32_t aExp;
6003     uint64_t lastBitMask, roundBitsMask;
6004     float128 z;
6005 
6006     aExp = extractFloat128Exp( a );
6007     if ( 0x402F <= aExp ) {
6008         if ( 0x406F <= aExp ) {
6009             if (    ( aExp == 0x7FFF )
6010                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
6011                ) {
6012                 return propagateFloat128NaN(a, a, status);
6013             }
6014             return a;
6015         }
6016         lastBitMask = 1;
6017         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
6018         roundBitsMask = lastBitMask - 1;
6019         z = a;
6020         switch (status->float_rounding_mode) {
6021         case float_round_nearest_even:
6022             if ( lastBitMask ) {
6023                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
6024                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
6025             }
6026             else {
6027                 if ( (int64_t) z.low < 0 ) {
6028                     ++z.high;
6029                     if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
6030                 }
6031             }
6032             break;
6033         case float_round_ties_away:
6034             if (lastBitMask) {
6035                 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
6036             } else {
6037                 if ((int64_t) z.low < 0) {
6038                     ++z.high;
6039                 }
6040             }
6041             break;
6042         case float_round_to_zero:
6043             break;
6044         case float_round_up:
6045             if (!extractFloat128Sign(z)) {
6046                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6047             }
6048             break;
6049         case float_round_down:
6050             if (extractFloat128Sign(z)) {
6051                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6052             }
6053             break;
6054         default:
6055             abort();
6056         }
6057         z.low &= ~ roundBitsMask;
6058     }
6059     else {
6060         if ( aExp < 0x3FFF ) {
6061             if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
6062             status->float_exception_flags |= float_flag_inexact;
6063             aSign = extractFloat128Sign( a );
6064             switch (status->float_rounding_mode) {
6065              case float_round_nearest_even:
6066                 if (    ( aExp == 0x3FFE )
6067                      && (   extractFloat128Frac0( a )
6068                           | extractFloat128Frac1( a ) )
6069                    ) {
6070                     return packFloat128( aSign, 0x3FFF, 0, 0 );
6071                 }
6072                 break;
6073             case float_round_ties_away:
6074                 if (aExp == 0x3FFE) {
6075                     return packFloat128(aSign, 0x3FFF, 0, 0);
6076                 }
6077                 break;
6078              case float_round_down:
6079                 return
6080                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
6081                     : packFloat128( 0, 0, 0, 0 );
6082              case float_round_up:
6083                 return
6084                       aSign ? packFloat128( 1, 0, 0, 0 )
6085                     : packFloat128( 0, 0x3FFF, 0, 0 );
6086             }
6087             return packFloat128( aSign, 0, 0, 0 );
6088         }
6089         lastBitMask = 1;
6090         lastBitMask <<= 0x402F - aExp;
6091         roundBitsMask = lastBitMask - 1;
6092         z.low = 0;
6093         z.high = a.high;
6094         switch (status->float_rounding_mode) {
6095         case float_round_nearest_even:
6096             z.high += lastBitMask>>1;
6097             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
6098                 z.high &= ~ lastBitMask;
6099             }
6100             break;
6101         case float_round_ties_away:
6102             z.high += lastBitMask>>1;
6103             break;
6104         case float_round_to_zero:
6105             break;
6106         case float_round_up:
6107             if (!extractFloat128Sign(z)) {
6108                 z.high |= ( a.low != 0 );
6109                 z.high += roundBitsMask;
6110             }
6111             break;
6112         case float_round_down:
6113             if (extractFloat128Sign(z)) {
6114                 z.high |= (a.low != 0);
6115                 z.high += roundBitsMask;
6116             }
6117             break;
6118         default:
6119             abort();
6120         }
6121         z.high &= ~ roundBitsMask;
6122     }
6123     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
6124         status->float_exception_flags |= float_flag_inexact;
6125     }
6126     return z;
6127 
6128 }
6129 
6130 /*----------------------------------------------------------------------------
6131 | Returns the result of adding the absolute values of the quadruple-precision
6132 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
6133 | before being returned.  `zSign' is ignored if the result is a NaN.
6134 | The addition is performed according to the IEC/IEEE Standard for Binary
6135 | Floating-Point Arithmetic.
6136 *----------------------------------------------------------------------------*/
6137 
6138 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
6139                                 float_status *status)
6140 {
6141     int32_t aExp, bExp, zExp;
6142     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6143     int32_t expDiff;
6144 
6145     aSig1 = extractFloat128Frac1( a );
6146     aSig0 = extractFloat128Frac0( a );
6147     aExp = extractFloat128Exp( a );
6148     bSig1 = extractFloat128Frac1( b );
6149     bSig0 = extractFloat128Frac0( b );
6150     bExp = extractFloat128Exp( b );
6151     expDiff = aExp - bExp;
6152     if ( 0 < expDiff ) {
6153         if ( aExp == 0x7FFF ) {
6154             if (aSig0 | aSig1) {
6155                 return propagateFloat128NaN(a, b, status);
6156             }
6157             return a;
6158         }
6159         if ( bExp == 0 ) {
6160             --expDiff;
6161         }
6162         else {
6163             bSig0 |= LIT64( 0x0001000000000000 );
6164         }
6165         shift128ExtraRightJamming(
6166             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
6167         zExp = aExp;
6168     }
6169     else if ( expDiff < 0 ) {
6170         if ( bExp == 0x7FFF ) {
6171             if (bSig0 | bSig1) {
6172                 return propagateFloat128NaN(a, b, status);
6173             }
6174             return packFloat128( zSign, 0x7FFF, 0, 0 );
6175         }
6176         if ( aExp == 0 ) {
6177             ++expDiff;
6178         }
6179         else {
6180             aSig0 |= LIT64( 0x0001000000000000 );
6181         }
6182         shift128ExtraRightJamming(
6183             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
6184         zExp = bExp;
6185     }
6186     else {
6187         if ( aExp == 0x7FFF ) {
6188             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6189                 return propagateFloat128NaN(a, b, status);
6190             }
6191             return a;
6192         }
6193         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6194         if ( aExp == 0 ) {
6195             if (status->flush_to_zero) {
6196                 if (zSig0 | zSig1) {
6197                     float_raise(float_flag_output_denormal, status);
6198                 }
6199                 return packFloat128(zSign, 0, 0, 0);
6200             }
6201             return packFloat128( zSign, 0, zSig0, zSig1 );
6202         }
6203         zSig2 = 0;
6204         zSig0 |= LIT64( 0x0002000000000000 );
6205         zExp = aExp;
6206         goto shiftRight1;
6207     }
6208     aSig0 |= LIT64( 0x0001000000000000 );
6209     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6210     --zExp;
6211     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
6212     ++zExp;
6213  shiftRight1:
6214     shift128ExtraRightJamming(
6215         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6216  roundAndPack:
6217     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6218 
6219 }
6220 
6221 /*----------------------------------------------------------------------------
6222 | Returns the result of subtracting the absolute values of the quadruple-
6223 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
6224 | difference is negated before being returned.  `zSign' is ignored if the
6225 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
6226 | Standard for Binary Floating-Point Arithmetic.
6227 *----------------------------------------------------------------------------*/
6228 
6229 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
6230                                 float_status *status)
6231 {
6232     int32_t aExp, bExp, zExp;
6233     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
6234     int32_t expDiff;
6235 
6236     aSig1 = extractFloat128Frac1( a );
6237     aSig0 = extractFloat128Frac0( a );
6238     aExp = extractFloat128Exp( a );
6239     bSig1 = extractFloat128Frac1( b );
6240     bSig0 = extractFloat128Frac0( b );
6241     bExp = extractFloat128Exp( b );
6242     expDiff = aExp - bExp;
6243     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6244     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6245     if ( 0 < expDiff ) goto aExpBigger;
6246     if ( expDiff < 0 ) goto bExpBigger;
6247     if ( aExp == 0x7FFF ) {
6248         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6249             return propagateFloat128NaN(a, b, status);
6250         }
6251         float_raise(float_flag_invalid, status);
6252         return float128_default_nan(status);
6253     }
6254     if ( aExp == 0 ) {
6255         aExp = 1;
6256         bExp = 1;
6257     }
6258     if ( bSig0 < aSig0 ) goto aBigger;
6259     if ( aSig0 < bSig0 ) goto bBigger;
6260     if ( bSig1 < aSig1 ) goto aBigger;
6261     if ( aSig1 < bSig1 ) goto bBigger;
6262     return packFloat128(status->float_rounding_mode == float_round_down,
6263                         0, 0, 0);
6264  bExpBigger:
6265     if ( bExp == 0x7FFF ) {
6266         if (bSig0 | bSig1) {
6267             return propagateFloat128NaN(a, b, status);
6268         }
6269         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6270     }
6271     if ( aExp == 0 ) {
6272         ++expDiff;
6273     }
6274     else {
6275         aSig0 |= LIT64( 0x4000000000000000 );
6276     }
6277     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6278     bSig0 |= LIT64( 0x4000000000000000 );
6279  bBigger:
6280     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6281     zExp = bExp;
6282     zSign ^= 1;
6283     goto normalizeRoundAndPack;
6284  aExpBigger:
6285     if ( aExp == 0x7FFF ) {
6286         if (aSig0 | aSig1) {
6287             return propagateFloat128NaN(a, b, status);
6288         }
6289         return a;
6290     }
6291     if ( bExp == 0 ) {
6292         --expDiff;
6293     }
6294     else {
6295         bSig0 |= LIT64( 0x4000000000000000 );
6296     }
6297     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6298     aSig0 |= LIT64( 0x4000000000000000 );
6299  aBigger:
6300     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6301     zExp = aExp;
6302  normalizeRoundAndPack:
6303     --zExp;
6304     return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
6305                                          status);
6306 
6307 }
6308 
6309 /*----------------------------------------------------------------------------
6310 | Returns the result of adding the quadruple-precision floating-point values
6311 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
6312 | for Binary Floating-Point Arithmetic.
6313 *----------------------------------------------------------------------------*/
6314 
6315 float128 float128_add(float128 a, float128 b, float_status *status)
6316 {
6317     flag aSign, bSign;
6318 
6319     aSign = extractFloat128Sign( a );
6320     bSign = extractFloat128Sign( b );
6321     if ( aSign == bSign ) {
6322         return addFloat128Sigs(a, b, aSign, status);
6323     }
6324     else {
6325         return subFloat128Sigs(a, b, aSign, status);
6326     }
6327 
6328 }
6329 
6330 /*----------------------------------------------------------------------------
6331 | Returns the result of subtracting the quadruple-precision floating-point
6332 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6333 | Standard for Binary Floating-Point Arithmetic.
6334 *----------------------------------------------------------------------------*/
6335 
6336 float128 float128_sub(float128 a, float128 b, float_status *status)
6337 {
6338     flag aSign, bSign;
6339 
6340     aSign = extractFloat128Sign( a );
6341     bSign = extractFloat128Sign( b );
6342     if ( aSign == bSign ) {
6343         return subFloat128Sigs(a, b, aSign, status);
6344     }
6345     else {
6346         return addFloat128Sigs(a, b, aSign, status);
6347     }
6348 
6349 }
6350 
6351 /*----------------------------------------------------------------------------
6352 | Returns the result of multiplying the quadruple-precision floating-point
6353 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6354 | Standard for Binary Floating-Point Arithmetic.
6355 *----------------------------------------------------------------------------*/
6356 
6357 float128 float128_mul(float128 a, float128 b, float_status *status)
6358 {
6359     flag aSign, bSign, zSign;
6360     int32_t aExp, bExp, zExp;
6361     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
6362 
6363     aSig1 = extractFloat128Frac1( a );
6364     aSig0 = extractFloat128Frac0( a );
6365     aExp = extractFloat128Exp( a );
6366     aSign = extractFloat128Sign( a );
6367     bSig1 = extractFloat128Frac1( b );
6368     bSig0 = extractFloat128Frac0( b );
6369     bExp = extractFloat128Exp( b );
6370     bSign = extractFloat128Sign( b );
6371     zSign = aSign ^ bSign;
6372     if ( aExp == 0x7FFF ) {
6373         if (    ( aSig0 | aSig1 )
6374              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6375             return propagateFloat128NaN(a, b, status);
6376         }
6377         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6378         return packFloat128( zSign, 0x7FFF, 0, 0 );
6379     }
6380     if ( bExp == 0x7FFF ) {
6381         if (bSig0 | bSig1) {
6382             return propagateFloat128NaN(a, b, status);
6383         }
6384         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6385  invalid:
6386             float_raise(float_flag_invalid, status);
6387             return float128_default_nan(status);
6388         }
6389         return packFloat128( zSign, 0x7FFF, 0, 0 );
6390     }
6391     if ( aExp == 0 ) {
6392         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6393         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6394     }
6395     if ( bExp == 0 ) {
6396         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6397         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6398     }
6399     zExp = aExp + bExp - 0x4000;
6400     aSig0 |= LIT64( 0x0001000000000000 );
6401     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6402     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6403     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6404     zSig2 |= ( zSig3 != 0 );
6405     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6406         shift128ExtraRightJamming(
6407             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6408         ++zExp;
6409     }
6410     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6411 
6412 }
6413 
6414 /*----------------------------------------------------------------------------
6415 | Returns the result of dividing the quadruple-precision floating-point value
6416 | `a' by the corresponding value `b'.  The operation is performed according to
6417 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6418 *----------------------------------------------------------------------------*/
6419 
6420 float128 float128_div(float128 a, float128 b, float_status *status)
6421 {
6422     flag aSign, bSign, zSign;
6423     int32_t aExp, bExp, zExp;
6424     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6425     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6426 
6427     aSig1 = extractFloat128Frac1( a );
6428     aSig0 = extractFloat128Frac0( a );
6429     aExp = extractFloat128Exp( a );
6430     aSign = extractFloat128Sign( a );
6431     bSig1 = extractFloat128Frac1( b );
6432     bSig0 = extractFloat128Frac0( b );
6433     bExp = extractFloat128Exp( b );
6434     bSign = extractFloat128Sign( b );
6435     zSign = aSign ^ bSign;
6436     if ( aExp == 0x7FFF ) {
6437         if (aSig0 | aSig1) {
6438             return propagateFloat128NaN(a, b, status);
6439         }
6440         if ( bExp == 0x7FFF ) {
6441             if (bSig0 | bSig1) {
6442                 return propagateFloat128NaN(a, b, status);
6443             }
6444             goto invalid;
6445         }
6446         return packFloat128( zSign, 0x7FFF, 0, 0 );
6447     }
6448     if ( bExp == 0x7FFF ) {
6449         if (bSig0 | bSig1) {
6450             return propagateFloat128NaN(a, b, status);
6451         }
6452         return packFloat128( zSign, 0, 0, 0 );
6453     }
6454     if ( bExp == 0 ) {
6455         if ( ( bSig0 | bSig1 ) == 0 ) {
6456             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6457  invalid:
6458                 float_raise(float_flag_invalid, status);
6459                 return float128_default_nan(status);
6460             }
6461             float_raise(float_flag_divbyzero, status);
6462             return packFloat128( zSign, 0x7FFF, 0, 0 );
6463         }
6464         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6465     }
6466     if ( aExp == 0 ) {
6467         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6468         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6469     }
6470     zExp = aExp - bExp + 0x3FFD;
6471     shortShift128Left(
6472         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6473     shortShift128Left(
6474         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6475     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6476         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6477         ++zExp;
6478     }
6479     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6480     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6481     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
6482     while ( (int64_t) rem0 < 0 ) {
6483         --zSig0;
6484         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6485     }
6486     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6487     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6488         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6489         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
6490         while ( (int64_t) rem1 < 0 ) {
6491             --zSig1;
6492             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6493         }
6494         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6495     }
6496     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
6497     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6498 
6499 }
6500 
6501 /*----------------------------------------------------------------------------
6502 | Returns the remainder of the quadruple-precision floating-point value `a'
6503 | with respect to the corresponding value `b'.  The operation is performed
6504 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6505 *----------------------------------------------------------------------------*/
6506 
6507 float128 float128_rem(float128 a, float128 b, float_status *status)
6508 {
6509     flag aSign, zSign;
6510     int32_t aExp, bExp, expDiff;
6511     uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6512     uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6513     int64_t sigMean0;
6514 
6515     aSig1 = extractFloat128Frac1( a );
6516     aSig0 = extractFloat128Frac0( a );
6517     aExp = extractFloat128Exp( a );
6518     aSign = extractFloat128Sign( a );
6519     bSig1 = extractFloat128Frac1( b );
6520     bSig0 = extractFloat128Frac0( b );
6521     bExp = extractFloat128Exp( b );
6522     if ( aExp == 0x7FFF ) {
6523         if (    ( aSig0 | aSig1 )
6524              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6525             return propagateFloat128NaN(a, b, status);
6526         }
6527         goto invalid;
6528     }
6529     if ( bExp == 0x7FFF ) {
6530         if (bSig0 | bSig1) {
6531             return propagateFloat128NaN(a, b, status);
6532         }
6533         return a;
6534     }
6535     if ( bExp == 0 ) {
6536         if ( ( bSig0 | bSig1 ) == 0 ) {
6537  invalid:
6538             float_raise(float_flag_invalid, status);
6539             return float128_default_nan(status);
6540         }
6541         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6542     }
6543     if ( aExp == 0 ) {
6544         if ( ( aSig0 | aSig1 ) == 0 ) return a;
6545         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6546     }
6547     expDiff = aExp - bExp;
6548     if ( expDiff < -1 ) return a;
6549     shortShift128Left(
6550         aSig0 | LIT64( 0x0001000000000000 ),
6551         aSig1,
6552         15 - ( expDiff < 0 ),
6553         &aSig0,
6554         &aSig1
6555     );
6556     shortShift128Left(
6557         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6558     q = le128( bSig0, bSig1, aSig0, aSig1 );
6559     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6560     expDiff -= 64;
6561     while ( 0 < expDiff ) {
6562         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6563         q = ( 4 < q ) ? q - 4 : 0;
6564         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6565         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6566         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6567         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6568         expDiff -= 61;
6569     }
6570     if ( -64 < expDiff ) {
6571         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6572         q = ( 4 < q ) ? q - 4 : 0;
6573         q >>= - expDiff;
6574         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6575         expDiff += 52;
6576         if ( expDiff < 0 ) {
6577             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6578         }
6579         else {
6580             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6581         }
6582         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6583         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6584     }
6585     else {
6586         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6587         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6588     }
6589     do {
6590         alternateASig0 = aSig0;
6591         alternateASig1 = aSig1;
6592         ++q;
6593         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6594     } while ( 0 <= (int64_t) aSig0 );
6595     add128(
6596         aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
6597     if (    ( sigMean0 < 0 )
6598          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6599         aSig0 = alternateASig0;
6600         aSig1 = alternateASig1;
6601     }
6602     zSign = ( (int64_t) aSig0 < 0 );
6603     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6604     return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
6605                                          status);
6606 }
6607 
6608 /*----------------------------------------------------------------------------
6609 | Returns the square root of the quadruple-precision floating-point value `a'.
6610 | The operation is performed according to the IEC/IEEE Standard for Binary
6611 | Floating-Point Arithmetic.
6612 *----------------------------------------------------------------------------*/
6613 
6614 float128 float128_sqrt(float128 a, float_status *status)
6615 {
6616     flag aSign;
6617     int32_t aExp, zExp;
6618     uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6619     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6620 
6621     aSig1 = extractFloat128Frac1( a );
6622     aSig0 = extractFloat128Frac0( a );
6623     aExp = extractFloat128Exp( a );
6624     aSign = extractFloat128Sign( a );
6625     if ( aExp == 0x7FFF ) {
6626         if (aSig0 | aSig1) {
6627             return propagateFloat128NaN(a, a, status);
6628         }
6629         if ( ! aSign ) return a;
6630         goto invalid;
6631     }
6632     if ( aSign ) {
6633         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6634  invalid:
6635         float_raise(float_flag_invalid, status);
6636         return float128_default_nan(status);
6637     }
6638     if ( aExp == 0 ) {
6639         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6640         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6641     }
6642     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6643     aSig0 |= LIT64( 0x0001000000000000 );
6644     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6645     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6646     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6647     doubleZSig0 = zSig0<<1;
6648     mul64To128( zSig0, zSig0, &term0, &term1 );
6649     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6650     while ( (int64_t) rem0 < 0 ) {
6651         --zSig0;
6652         doubleZSig0 -= 2;
6653         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6654     }
6655     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6656     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6657         if ( zSig1 == 0 ) zSig1 = 1;
6658         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6659         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6660         mul64To128( zSig1, zSig1, &term2, &term3 );
6661         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6662         while ( (int64_t) rem1 < 0 ) {
6663             --zSig1;
6664             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6665             term3 |= 1;
6666             term2 |= doubleZSig0;
6667             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6668         }
6669         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6670     }
6671     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6672     return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
6673 
6674 }
6675 
6676 /*----------------------------------------------------------------------------
6677 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6678 | the corresponding value `b', and 0 otherwise.  The invalid exception is
6679 | raised if either operand is a NaN.  Otherwise, the comparison is performed
6680 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6681 *----------------------------------------------------------------------------*/
6682 
6683 int float128_eq(float128 a, float128 b, float_status *status)
6684 {
6685 
6686     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6687               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6688          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6689               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6690        ) {
6691         float_raise(float_flag_invalid, status);
6692         return 0;
6693     }
6694     return
6695            ( a.low == b.low )
6696         && (    ( a.high == b.high )
6697              || (    ( a.low == 0 )
6698                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6699            );
6700 
6701 }
6702 
6703 /*----------------------------------------------------------------------------
6704 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6705 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
6706 | exception is raised if either operand is a NaN.  The comparison is performed
6707 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6708 *----------------------------------------------------------------------------*/
6709 
6710 int float128_le(float128 a, float128 b, float_status *status)
6711 {
6712     flag aSign, bSign;
6713 
6714     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6715               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6716          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6717               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6718        ) {
6719         float_raise(float_flag_invalid, status);
6720         return 0;
6721     }
6722     aSign = extractFloat128Sign( a );
6723     bSign = extractFloat128Sign( b );
6724     if ( aSign != bSign ) {
6725         return
6726                aSign
6727             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6728                  == 0 );
6729     }
6730     return
6731           aSign ? le128( b.high, b.low, a.high, a.low )
6732         : le128( a.high, a.low, b.high, b.low );
6733 
6734 }
6735 
6736 /*----------------------------------------------------------------------------
6737 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6738 | the corresponding value `b', and 0 otherwise.  The invalid exception is
6739 | raised if either operand is a NaN.  The comparison is performed according
6740 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6741 *----------------------------------------------------------------------------*/
6742 
6743 int float128_lt(float128 a, float128 b, float_status *status)
6744 {
6745     flag aSign, bSign;
6746 
6747     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6748               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6749          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6750               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6751        ) {
6752         float_raise(float_flag_invalid, status);
6753         return 0;
6754     }
6755     aSign = extractFloat128Sign( a );
6756     bSign = extractFloat128Sign( b );
6757     if ( aSign != bSign ) {
6758         return
6759                aSign
6760             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6761                  != 0 );
6762     }
6763     return
6764           aSign ? lt128( b.high, b.low, a.high, a.low )
6765         : lt128( a.high, a.low, b.high, b.low );
6766 
6767 }
6768 
6769 /*----------------------------------------------------------------------------
6770 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6771 | be compared, and 0 otherwise.  The invalid exception is raised if either
6772 | operand is a NaN. The comparison is performed according to the IEC/IEEE
6773 | Standard for Binary Floating-Point Arithmetic.
6774 *----------------------------------------------------------------------------*/
6775 
6776 int float128_unordered(float128 a, float128 b, float_status *status)
6777 {
6778     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6779               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6780          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6781               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6782        ) {
6783         float_raise(float_flag_invalid, status);
6784         return 1;
6785     }
6786     return 0;
6787 }
6788 
6789 /*----------------------------------------------------------------------------
6790 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6791 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
6792 | exception.  The comparison is performed according to the IEC/IEEE Standard
6793 | for Binary Floating-Point Arithmetic.
6794 *----------------------------------------------------------------------------*/
6795 
6796 int float128_eq_quiet(float128 a, float128 b, float_status *status)
6797 {
6798 
6799     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6800               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6801          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6802               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6803        ) {
6804         if (float128_is_signaling_nan(a, status)
6805          || float128_is_signaling_nan(b, status)) {
6806             float_raise(float_flag_invalid, status);
6807         }
6808         return 0;
6809     }
6810     return
6811            ( a.low == b.low )
6812         && (    ( a.high == b.high )
6813              || (    ( a.low == 0 )
6814                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6815            );
6816 
6817 }
6818 
6819 /*----------------------------------------------------------------------------
6820 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6821 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
6822 | cause an exception.  Otherwise, the comparison is performed according to the
6823 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6824 *----------------------------------------------------------------------------*/
6825 
6826 int float128_le_quiet(float128 a, float128 b, float_status *status)
6827 {
6828     flag aSign, bSign;
6829 
6830     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6831               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6832          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6833               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6834        ) {
6835         if (float128_is_signaling_nan(a, status)
6836          || float128_is_signaling_nan(b, status)) {
6837             float_raise(float_flag_invalid, status);
6838         }
6839         return 0;
6840     }
6841     aSign = extractFloat128Sign( a );
6842     bSign = extractFloat128Sign( b );
6843     if ( aSign != bSign ) {
6844         return
6845                aSign
6846             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6847                  == 0 );
6848     }
6849     return
6850           aSign ? le128( b.high, b.low, a.high, a.low )
6851         : le128( a.high, a.low, b.high, b.low );
6852 
6853 }
6854 
6855 /*----------------------------------------------------------------------------
6856 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6857 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
6858 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
6859 | Standard for Binary Floating-Point Arithmetic.
6860 *----------------------------------------------------------------------------*/
6861 
6862 int float128_lt_quiet(float128 a, float128 b, float_status *status)
6863 {
6864     flag aSign, bSign;
6865 
6866     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6867               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6868          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6869               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6870        ) {
6871         if (float128_is_signaling_nan(a, status)
6872          || float128_is_signaling_nan(b, status)) {
6873             float_raise(float_flag_invalid, status);
6874         }
6875         return 0;
6876     }
6877     aSign = extractFloat128Sign( a );
6878     bSign = extractFloat128Sign( b );
6879     if ( aSign != bSign ) {
6880         return
6881                aSign
6882             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6883                  != 0 );
6884     }
6885     return
6886           aSign ? lt128( b.high, b.low, a.high, a.low )
6887         : lt128( a.high, a.low, b.high, b.low );
6888 
6889 }
6890 
6891 /*----------------------------------------------------------------------------
6892 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6893 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
6894 | comparison is performed according to the IEC/IEEE Standard for Binary
6895 | Floating-Point Arithmetic.
6896 *----------------------------------------------------------------------------*/
6897 
6898 int float128_unordered_quiet(float128 a, float128 b, float_status *status)
6899 {
6900     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6901               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6902          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6903               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6904        ) {
6905         if (float128_is_signaling_nan(a, status)
6906          || float128_is_signaling_nan(b, status)) {
6907             float_raise(float_flag_invalid, status);
6908         }
6909         return 1;
6910     }
6911     return 0;
6912 }
6913 
6914 static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
6915                                             int is_quiet, float_status *status)
6916 {
6917     flag aSign, bSign;
6918 
6919     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6920         float_raise(float_flag_invalid, status);
6921         return float_relation_unordered;
6922     }
6923     if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
6924           ( extractFloatx80Frac( a )<<1 ) ) ||
6925         ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
6926           ( extractFloatx80Frac( b )<<1 ) )) {
6927         if (!is_quiet ||
6928             floatx80_is_signaling_nan(a, status) ||
6929             floatx80_is_signaling_nan(b, status)) {
6930             float_raise(float_flag_invalid, status);
6931         }
6932         return float_relation_unordered;
6933     }
6934     aSign = extractFloatx80Sign( a );
6935     bSign = extractFloatx80Sign( b );
6936     if ( aSign != bSign ) {
6937 
6938         if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
6939              ( ( a.low | b.low ) == 0 ) ) {
6940             /* zero case */
6941             return float_relation_equal;
6942         } else {
6943             return 1 - (2 * aSign);
6944         }
6945     } else {
6946         if (a.low == b.low && a.high == b.high) {
6947             return float_relation_equal;
6948         } else {
6949             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6950         }
6951     }
6952 }
6953 
6954 int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
6955 {
6956     return floatx80_compare_internal(a, b, 0, status);
6957 }
6958 
6959 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
6960 {
6961     return floatx80_compare_internal(a, b, 1, status);
6962 }
6963 
6964 static inline int float128_compare_internal(float128 a, float128 b,
6965                                             int is_quiet, float_status *status)
6966 {
6967     flag aSign, bSign;
6968 
6969     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
6970           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
6971         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
6972           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
6973         if (!is_quiet ||
6974             float128_is_signaling_nan(a, status) ||
6975             float128_is_signaling_nan(b, status)) {
6976             float_raise(float_flag_invalid, status);
6977         }
6978         return float_relation_unordered;
6979     }
6980     aSign = extractFloat128Sign( a );
6981     bSign = extractFloat128Sign( b );
6982     if ( aSign != bSign ) {
6983         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
6984             /* zero case */
6985             return float_relation_equal;
6986         } else {
6987             return 1 - (2 * aSign);
6988         }
6989     } else {
6990         if (a.low == b.low && a.high == b.high) {
6991             return float_relation_equal;
6992         } else {
6993             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6994         }
6995     }
6996 }
6997 
6998 int float128_compare(float128 a, float128 b, float_status *status)
6999 {
7000     return float128_compare_internal(a, b, 0, status);
7001 }
7002 
7003 int float128_compare_quiet(float128 a, float128 b, float_status *status)
7004 {
7005     return float128_compare_internal(a, b, 1, status);
7006 }
7007 
7008 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
7009 {
7010     flag aSign;
7011     int32_t aExp;
7012     uint64_t aSig;
7013 
7014     if (floatx80_invalid_encoding(a)) {
7015         float_raise(float_flag_invalid, status);
7016         return floatx80_default_nan(status);
7017     }
7018     aSig = extractFloatx80Frac( a );
7019     aExp = extractFloatx80Exp( a );
7020     aSign = extractFloatx80Sign( a );
7021 
7022     if ( aExp == 0x7FFF ) {
7023         if ( aSig<<1 ) {
7024             return propagateFloatx80NaN(a, a, status);
7025         }
7026         return a;
7027     }
7028 
7029     if (aExp == 0) {
7030         if (aSig == 0) {
7031             return a;
7032         }
7033         aExp++;
7034     }
7035 
7036     if (n > 0x10000) {
7037         n = 0x10000;
7038     } else if (n < -0x10000) {
7039         n = -0x10000;
7040     }
7041 
7042     aExp += n;
7043     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7044                                          aSign, aExp, aSig, 0, status);
7045 }
7046 
7047 float128 float128_scalbn(float128 a, int n, float_status *status)
7048 {
7049     flag aSign;
7050     int32_t aExp;
7051     uint64_t aSig0, aSig1;
7052 
7053     aSig1 = extractFloat128Frac1( a );
7054     aSig0 = extractFloat128Frac0( a );
7055     aExp = extractFloat128Exp( a );
7056     aSign = extractFloat128Sign( a );
7057     if ( aExp == 0x7FFF ) {
7058         if ( aSig0 | aSig1 ) {
7059             return propagateFloat128NaN(a, a, status);
7060         }
7061         return a;
7062     }
7063     if (aExp != 0) {
7064         aSig0 |= LIT64( 0x0001000000000000 );
7065     } else if (aSig0 == 0 && aSig1 == 0) {
7066         return a;
7067     } else {
7068         aExp++;
7069     }
7070 
7071     if (n > 0x10000) {
7072         n = 0x10000;
7073     } else if (n < -0x10000) {
7074         n = -0x10000;
7075     }
7076 
7077     aExp += n - 1;
7078     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7079                                          , status);
7080 
7081 }
7082