xref: /openbmc/qemu/fpu/softfloat.c (revision 5321fa68)
1 /*
2  * QEMU float support
3  *
4  * The code in this source file is derived from release 2a of the SoftFloat
5  * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6  * some later contributions) are provided under that license, as detailed below.
7  * It has subsequently been modified by contributors to the QEMU Project,
8  * so some portions are provided under:
9  *  the SoftFloat-2a license
10  *  the BSD license
11  *  GPL-v2-or-later
12  *
13  * Any future contributions to this file after December 1st 2014 will be
14  * taken to be licensed under the Softfloat-2a license unless specifically
15  * indicated otherwise.
16  */
17 
18 /*
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
22 
23 Written by John R. Hauser.  This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704.  Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980.  The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
32 
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38 
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
43 
44 ===============================================================================
45 */
46 
47 /* BSD licensing:
48  * Copyright (c) 2006, Fabrice Bellard
49  * All rights reserved.
50  *
51  * Redistribution and use in source and binary forms, with or without
52  * modification, are permitted provided that the following conditions are met:
53  *
54  * 1. Redistributions of source code must retain the above copyright notice,
55  * this list of conditions and the following disclaimer.
56  *
57  * 2. Redistributions in binary form must reproduce the above copyright notice,
58  * this list of conditions and the following disclaimer in the documentation
59  * and/or other materials provided with the distribution.
60  *
61  * 3. Neither the name of the copyright holder nor the names of its contributors
62  * may be used to endorse or promote products derived from this software without
63  * specific prior written permission.
64  *
65  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75  * THE POSSIBILITY OF SUCH DAMAGE.
76  */
77 
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79  * version 2 or later. See the COPYING file in the top-level directory.
80  */
81 
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83  * target-dependent and needs the TARGET_* macros.
84  */
85 #include "qemu/osdep.h"
86 #include "qemu/bitops.h"
87 #include "fpu/softfloat.h"
88 
89 /* We only need stdlib for abort() */
90 
91 /*----------------------------------------------------------------------------
92 | Primitive arithmetic functions, including multi-word arithmetic, and
93 | division and square root approximations.  (Can be specialized to target if
94 | desired.)
95 *----------------------------------------------------------------------------*/
96 #include "fpu/softfloat-macros.h"
97 
98 /*----------------------------------------------------------------------------
99 | Returns the fraction bits of the half-precision floating-point value `a'.
100 *----------------------------------------------------------------------------*/
101 
102 static inline uint32_t extractFloat16Frac(float16 a)
103 {
104     return float16_val(a) & 0x3ff;
105 }
106 
107 /*----------------------------------------------------------------------------
108 | Returns the exponent bits of the half-precision floating-point value `a'.
109 *----------------------------------------------------------------------------*/
110 
111 static inline int extractFloat16Exp(float16 a)
112 {
113     return (float16_val(a) >> 10) & 0x1f;
114 }
115 
116 /*----------------------------------------------------------------------------
117 | Returns the fraction bits of the single-precision floating-point value `a'.
118 *----------------------------------------------------------------------------*/
119 
120 static inline uint32_t extractFloat32Frac(float32 a)
121 {
122     return float32_val(a) & 0x007FFFFF;
123 }
124 
125 /*----------------------------------------------------------------------------
126 | Returns the exponent bits of the single-precision floating-point value `a'.
127 *----------------------------------------------------------------------------*/
128 
129 static inline int extractFloat32Exp(float32 a)
130 {
131     return (float32_val(a) >> 23) & 0xFF;
132 }
133 
134 /*----------------------------------------------------------------------------
135 | Returns the sign bit of the single-precision floating-point value `a'.
136 *----------------------------------------------------------------------------*/
137 
138 static inline flag extractFloat32Sign(float32 a)
139 {
140     return float32_val(a) >> 31;
141 }
142 
143 /*----------------------------------------------------------------------------
144 | Returns the fraction bits of the double-precision floating-point value `a'.
145 *----------------------------------------------------------------------------*/
146 
147 static inline uint64_t extractFloat64Frac(float64 a)
148 {
149     return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF);
150 }
151 
152 /*----------------------------------------------------------------------------
153 | Returns the exponent bits of the double-precision floating-point value `a'.
154 *----------------------------------------------------------------------------*/
155 
156 static inline int extractFloat64Exp(float64 a)
157 {
158     return (float64_val(a) >> 52) & 0x7FF;
159 }
160 
161 /*----------------------------------------------------------------------------
162 | Returns the sign bit of the double-precision floating-point value `a'.
163 *----------------------------------------------------------------------------*/
164 
165 static inline flag extractFloat64Sign(float64 a)
166 {
167     return float64_val(a) >> 63;
168 }
169 
/*
 * Classify a floating point number. The enumerator order matters:
 * float_class_qnan and every value after it is a NaN, so
 * cls >= float_class_qnan tests for any NaN.
 *
 * The enum is packed so that it occupies as little storage as possible
 * inside FloatParts.
 */

typedef enum __attribute__ ((__packed__)) {
    float_class_unclassified,   /* raw fields, not yet canonicalized */
    float_class_zero,
    float_class_normal,
    float_class_inf,
    float_class_qnan,  /* all NaNs from here */
    float_class_snan,
} FloatClass;
183 
184 /* Simple helpers for checking if, or what kind of, NaN we have */
185 static inline __attribute__((unused)) bool is_nan(FloatClass c)
186 {
187     return unlikely(c >= float_class_qnan);
188 }
189 
190 static inline __attribute__((unused)) bool is_snan(FloatClass c)
191 {
192     return c == float_class_snan;
193 }
194 
195 static inline __attribute__((unused)) bool is_qnan(FloatClass c)
196 {
197     return c == float_class_qnan;
198 }
199 
200 /*
201  * Structure holding all of the decomposed parts of a float. The
202  * exponent is unbiased and the fraction is normalized. All
203  * calculations are done with a 64 bit fraction and then rounded as
204  * appropriate for the final format.
205  *
206  * Thanks to the packed FloatClass a decent compiler should be able to
207  * fit the whole structure into registers and avoid using the stack
208  * for parameter passing.
209  */
210 
211 typedef struct {
212     uint64_t frac;
213     int32_t  exp;
214     FloatClass cls;
215     bool sign;
216 } FloatParts;
217 
/*
 * Layout of the 64-bit decomposed fraction: the implicit (integer) bit
 * sits at bit 62, leaving bit 63 free to catch a carry out of an
 * addition or rounding increment.
 */
#define DECOMPOSED_BINARY_POINT    (64 - 2)
#define DECOMPOSED_IMPLICIT_BIT    (1ull << DECOMPOSED_BINARY_POINT)
#define DECOMPOSED_OVERFLOW_BIT    (DECOMPOSED_IMPLICIT_BIT << 1)
221 
/* Structure holding all of the relevant parameters for a format.
 *   exp_size: the size of the exponent field
 *   exp_bias: the offset applied to the exponent field
 *   exp_max: the maximum normalised exponent
 *   frac_size: the size of the fraction field
 *   frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
 * The following are computed based on the size of the fraction
 *   frac_lsb: least significant bit of fraction
 *   frac_lsbm1: the bit below the least significant bit (for rounding)
 *   round_mask/roundeven_mask: masks used for rounding
 * The following optional modifiers are available:
 *   arm_althp: handle ARM Alternative Half Precision
 */
typedef struct {
    int exp_size;
    int exp_bias;
    int exp_max;
    int frac_size;
    int frac_shift;
    uint64_t frac_lsb;
    uint64_t frac_lsbm1;
    uint64_t round_mask;
    uint64_t roundeven_mask;
    bool arm_althp;
} FloatFmt;
247 
/* Expand the FloatFmt designated initializers from the size of the
 * exponent field (E) and of the fraction field (F), both in bits.
 * The arguments are parenthesized so that expressions may be passed.
 * Optional modifiers (e.g. .arm_althp) can follow after a comma.
 */
#define FLOAT_PARAMS(E, F)                                             \
    .exp_size       = (E),                                             \
    .exp_bias       = ((1 << (E)) - 1) >> 1,                           \
    .exp_max        = (1 << (E)) - 1,                                  \
    .frac_size      = (F),                                             \
    .frac_shift     = DECOMPOSED_BINARY_POINT - (F),                   \
    .frac_lsb       = 1ull << (DECOMPOSED_BINARY_POINT - (F)),         \
    .frac_lsbm1     = 1ull << ((DECOMPOSED_BINARY_POINT - (F)) - 1),   \
    .round_mask     = (1ull << (DECOMPOSED_BINARY_POINT - (F))) - 1,   \
    .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - (F))) - 1
259 
260 static const FloatFmt float16_params = {
261     FLOAT_PARAMS(5, 10)
262 };
263 
264 static const FloatFmt float16_params_ahp = {
265     FLOAT_PARAMS(5, 10),
266     .arm_althp = true
267 };
268 
269 static const FloatFmt float32_params = {
270     FLOAT_PARAMS(8, 23)
271 };
272 
273 static const FloatFmt float64_params = {
274     FLOAT_PARAMS(11, 52)
275 };
276 
277 /* Unpack a float to parts, but do not canonicalize.  */
278 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw)
279 {
280     const int sign_pos = fmt.frac_size + fmt.exp_size;
281 
282     return (FloatParts) {
283         .cls = float_class_unclassified,
284         .sign = extract64(raw, sign_pos, 1),
285         .exp = extract64(raw, fmt.frac_size, fmt.exp_size),
286         .frac = extract64(raw, 0, fmt.frac_size),
287     };
288 }
289 
290 static inline FloatParts float16_unpack_raw(float16 f)
291 {
292     return unpack_raw(float16_params, f);
293 }
294 
295 static inline FloatParts float32_unpack_raw(float32 f)
296 {
297     return unpack_raw(float32_params, f);
298 }
299 
300 static inline FloatParts float64_unpack_raw(float64 f)
301 {
302     return unpack_raw(float64_params, f);
303 }
304 
305 /* Pack a float from parts, but do not canonicalize.  */
306 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p)
307 {
308     const int sign_pos = fmt.frac_size + fmt.exp_size;
309     uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp);
310     return deposit64(ret, sign_pos, 1, p.sign);
311 }
312 
313 static inline float16 float16_pack_raw(FloatParts p)
314 {
315     return make_float16(pack_raw(float16_params, p));
316 }
317 
318 static inline float32 float32_pack_raw(FloatParts p)
319 {
320     return make_float32(pack_raw(float32_params, p));
321 }
322 
323 static inline float64 float64_pack_raw(FloatParts p)
324 {
325     return make_float64(pack_raw(float64_params, p));
326 }
327 
328 /*----------------------------------------------------------------------------
329 | Functions and definitions to determine:  (1) whether tininess for underflow
330 | is detected before or after rounding by default, (2) what (if anything)
331 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
332 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
333 | are propagated from function inputs to output.  These details are target-
334 | specific.
335 *----------------------------------------------------------------------------*/
336 #include "softfloat-specialize.h"
337 
338 /* Canonicalize EXP and FRAC, setting CLS.  */
339 static FloatParts canonicalize(FloatParts part, const FloatFmt *parm,
340                                float_status *status)
341 {
342     if (part.exp == parm->exp_max && !parm->arm_althp) {
343         if (part.frac == 0) {
344             part.cls = float_class_inf;
345         } else {
346             part.frac <<= parm->frac_shift;
347             part.cls = (parts_is_snan_frac(part.frac, status)
348                         ? float_class_snan : float_class_qnan);
349         }
350     } else if (part.exp == 0) {
351         if (likely(part.frac == 0)) {
352             part.cls = float_class_zero;
353         } else if (status->flush_inputs_to_zero) {
354             float_raise(float_flag_input_denormal, status);
355             part.cls = float_class_zero;
356             part.frac = 0;
357         } else {
358             int shift = clz64(part.frac) - 1;
359             part.cls = float_class_normal;
360             part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
361             part.frac <<= shift;
362         }
363     } else {
364         part.cls = float_class_normal;
365         part.exp -= parm->exp_bias;
366         part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
367     }
368     return part;
369 }
370 
371 /* Round and uncanonicalize a floating-point number by parts. There
372  * are FRAC_SHIFT bits that may require rounding at the bottom of the
373  * fraction; these bits will be removed. The exponent will be biased
374  * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
375  */
376 
377 static FloatParts round_canonical(FloatParts p, float_status *s,
378                                   const FloatFmt *parm)
379 {
380     const uint64_t frac_lsbm1 = parm->frac_lsbm1;
381     const uint64_t round_mask = parm->round_mask;
382     const uint64_t roundeven_mask = parm->roundeven_mask;
383     const int exp_max = parm->exp_max;
384     const int frac_shift = parm->frac_shift;
385     uint64_t frac, inc;
386     int exp, flags = 0;
387     bool overflow_norm;
388 
389     frac = p.frac;
390     exp = p.exp;
391 
392     switch (p.cls) {
393     case float_class_normal:
394         switch (s->float_rounding_mode) {
395         case float_round_nearest_even:
396             overflow_norm = false;
397             inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
398             break;
399         case float_round_ties_away:
400             overflow_norm = false;
401             inc = frac_lsbm1;
402             break;
403         case float_round_to_zero:
404             overflow_norm = true;
405             inc = 0;
406             break;
407         case float_round_up:
408             inc = p.sign ? 0 : round_mask;
409             overflow_norm = p.sign;
410             break;
411         case float_round_down:
412             inc = p.sign ? round_mask : 0;
413             overflow_norm = !p.sign;
414             break;
415         default:
416             g_assert_not_reached();
417         }
418 
419         exp += parm->exp_bias;
420         if (likely(exp > 0)) {
421             if (frac & round_mask) {
422                 flags |= float_flag_inexact;
423                 frac += inc;
424                 if (frac & DECOMPOSED_OVERFLOW_BIT) {
425                     frac >>= 1;
426                     exp++;
427                 }
428             }
429             frac >>= frac_shift;
430 
431             if (parm->arm_althp) {
432                 /* ARM Alt HP eschews Inf and NaN for a wider exponent.  */
433                 if (unlikely(exp > exp_max)) {
434                     /* Overflow.  Return the maximum normal.  */
435                     flags = float_flag_invalid;
436                     exp = exp_max;
437                     frac = -1;
438                 }
439             } else if (unlikely(exp >= exp_max)) {
440                 flags |= float_flag_overflow | float_flag_inexact;
441                 if (overflow_norm) {
442                     exp = exp_max - 1;
443                     frac = -1;
444                 } else {
445                     p.cls = float_class_inf;
446                     goto do_inf;
447                 }
448             }
449         } else if (s->flush_to_zero) {
450             flags |= float_flag_output_denormal;
451             p.cls = float_class_zero;
452             goto do_zero;
453         } else {
454             bool is_tiny = (s->float_detect_tininess
455                             == float_tininess_before_rounding)
456                         || (exp < 0)
457                         || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT);
458 
459             shift64RightJamming(frac, 1 - exp, &frac);
460             if (frac & round_mask) {
461                 /* Need to recompute round-to-even.  */
462                 if (s->float_rounding_mode == float_round_nearest_even) {
463                     inc = ((frac & roundeven_mask) != frac_lsbm1
464                            ? frac_lsbm1 : 0);
465                 }
466                 flags |= float_flag_inexact;
467                 frac += inc;
468             }
469 
470             exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
471             frac >>= frac_shift;
472 
473             if (is_tiny && (flags & float_flag_inexact)) {
474                 flags |= float_flag_underflow;
475             }
476             if (exp == 0 && frac == 0) {
477                 p.cls = float_class_zero;
478             }
479         }
480         break;
481 
482     case float_class_zero:
483     do_zero:
484         exp = 0;
485         frac = 0;
486         break;
487 
488     case float_class_inf:
489     do_inf:
490         assert(!parm->arm_althp);
491         exp = exp_max;
492         frac = 0;
493         break;
494 
495     case float_class_qnan:
496     case float_class_snan:
497         assert(!parm->arm_althp);
498         exp = exp_max;
499         frac >>= parm->frac_shift;
500         break;
501 
502     default:
503         g_assert_not_reached();
504     }
505 
506     float_raise(flags, s);
507     p.exp = exp;
508     p.frac = frac;
509     return p;
510 }
511 
512 /* Explicit FloatFmt version */
513 static FloatParts float16a_unpack_canonical(float16 f, float_status *s,
514                                             const FloatFmt *params)
515 {
516     return canonicalize(float16_unpack_raw(f), params, s);
517 }
518 
519 static FloatParts float16_unpack_canonical(float16 f, float_status *s)
520 {
521     return float16a_unpack_canonical(f, s, &float16_params);
522 }
523 
524 static float16 float16a_round_pack_canonical(FloatParts p, float_status *s,
525                                              const FloatFmt *params)
526 {
527     return float16_pack_raw(round_canonical(p, s, params));
528 }
529 
530 static float16 float16_round_pack_canonical(FloatParts p, float_status *s)
531 {
532     return float16a_round_pack_canonical(p, s, &float16_params);
533 }
534 
535 static FloatParts float32_unpack_canonical(float32 f, float_status *s)
536 {
537     return canonicalize(float32_unpack_raw(f), &float32_params, s);
538 }
539 
540 static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
541 {
542     return float32_pack_raw(round_canonical(p, s, &float32_params));
543 }
544 
545 static FloatParts float64_unpack_canonical(float64 f, float_status *s)
546 {
547     return canonicalize(float64_unpack_raw(f), &float64_params, s);
548 }
549 
550 static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
551 {
552     return float64_pack_raw(round_canonical(p, s, &float64_params));
553 }
554 
555 static FloatParts return_nan(FloatParts a, float_status *s)
556 {
557     switch (a.cls) {
558     case float_class_snan:
559         s->float_exception_flags |= float_flag_invalid;
560         a = parts_silence_nan(a, s);
561         /* fall through */
562     case float_class_qnan:
563         if (s->default_nan_mode) {
564             return parts_default_nan(s);
565         }
566         break;
567 
568     default:
569         g_assert_not_reached();
570     }
571     return a;
572 }
573 
574 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s)
575 {
576     if (is_snan(a.cls) || is_snan(b.cls)) {
577         s->float_exception_flags |= float_flag_invalid;
578     }
579 
580     if (s->default_nan_mode) {
581         return parts_default_nan(s);
582     } else {
583         if (pickNaN(a.cls, b.cls,
584                     a.frac > b.frac ||
585                     (a.frac == b.frac && a.sign < b.sign))) {
586             a = b;
587         }
588         if (is_snan(a.cls)) {
589             return parts_silence_nan(a, s);
590         }
591     }
592     return a;
593 }
594 
595 static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c,
596                                   bool inf_zero, float_status *s)
597 {
598     int which;
599 
600     if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
601         s->float_exception_flags |= float_flag_invalid;
602     }
603 
604     which = pickNaNMulAdd(a.cls, b.cls, c.cls, inf_zero, s);
605 
606     if (s->default_nan_mode) {
607         /* Note that this check is after pickNaNMulAdd so that function
608          * has an opportunity to set the Invalid flag.
609          */
610         which = 3;
611     }
612 
613     switch (which) {
614     case 0:
615         break;
616     case 1:
617         a = b;
618         break;
619     case 2:
620         a = c;
621         break;
622     case 3:
623         return parts_default_nan(s);
624     default:
625         g_assert_not_reached();
626     }
627 
628     if (is_snan(a.cls)) {
629         return parts_silence_nan(a, s);
630     }
631     return a;
632 }
633 
634 /*
635  * Returns the result of adding or subtracting the values of the
636  * floating-point values `a' and `b'. The operation is performed
637  * according to the IEC/IEEE Standard for Binary Floating-Point
638  * Arithmetic.
639  */
640 
641 static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
642                                 float_status *s)
643 {
644     bool a_sign = a.sign;
645     bool b_sign = b.sign ^ subtract;
646 
647     if (a_sign != b_sign) {
648         /* Subtraction */
649 
650         if (a.cls == float_class_normal && b.cls == float_class_normal) {
651             if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
652                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
653                 a.frac = a.frac - b.frac;
654             } else {
655                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
656                 a.frac = b.frac - a.frac;
657                 a.exp = b.exp;
658                 a_sign ^= 1;
659             }
660 
661             if (a.frac == 0) {
662                 a.cls = float_class_zero;
663                 a.sign = s->float_rounding_mode == float_round_down;
664             } else {
665                 int shift = clz64(a.frac) - 1;
666                 a.frac = a.frac << shift;
667                 a.exp = a.exp - shift;
668                 a.sign = a_sign;
669             }
670             return a;
671         }
672         if (is_nan(a.cls) || is_nan(b.cls)) {
673             return pick_nan(a, b, s);
674         }
675         if (a.cls == float_class_inf) {
676             if (b.cls == float_class_inf) {
677                 float_raise(float_flag_invalid, s);
678                 return parts_default_nan(s);
679             }
680             return a;
681         }
682         if (a.cls == float_class_zero && b.cls == float_class_zero) {
683             a.sign = s->float_rounding_mode == float_round_down;
684             return a;
685         }
686         if (a.cls == float_class_zero || b.cls == float_class_inf) {
687             b.sign = a_sign ^ 1;
688             return b;
689         }
690         if (b.cls == float_class_zero) {
691             return a;
692         }
693     } else {
694         /* Addition */
695         if (a.cls == float_class_normal && b.cls == float_class_normal) {
696             if (a.exp > b.exp) {
697                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
698             } else if (a.exp < b.exp) {
699                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
700                 a.exp = b.exp;
701             }
702             a.frac += b.frac;
703             if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
704                 shift64RightJamming(a.frac, 1, &a.frac);
705                 a.exp += 1;
706             }
707             return a;
708         }
709         if (is_nan(a.cls) || is_nan(b.cls)) {
710             return pick_nan(a, b, s);
711         }
712         if (a.cls == float_class_inf || b.cls == float_class_zero) {
713             return a;
714         }
715         if (b.cls == float_class_inf || a.cls == float_class_zero) {
716             b.sign = b_sign;
717             return b;
718         }
719     }
720     g_assert_not_reached();
721 }
722 
723 /*
724  * Returns the result of adding or subtracting the floating-point
725  * values `a' and `b'. The operation is performed according to the
726  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
727  */
728 
729 float16  __attribute__((flatten)) float16_add(float16 a, float16 b,
730                                               float_status *status)
731 {
732     FloatParts pa = float16_unpack_canonical(a, status);
733     FloatParts pb = float16_unpack_canonical(b, status);
734     FloatParts pr = addsub_floats(pa, pb, false, status);
735 
736     return float16_round_pack_canonical(pr, status);
737 }
738 
739 float32 __attribute__((flatten)) float32_add(float32 a, float32 b,
740                                              float_status *status)
741 {
742     FloatParts pa = float32_unpack_canonical(a, status);
743     FloatParts pb = float32_unpack_canonical(b, status);
744     FloatParts pr = addsub_floats(pa, pb, false, status);
745 
746     return float32_round_pack_canonical(pr, status);
747 }
748 
749 float64 __attribute__((flatten)) float64_add(float64 a, float64 b,
750                                              float_status *status)
751 {
752     FloatParts pa = float64_unpack_canonical(a, status);
753     FloatParts pb = float64_unpack_canonical(b, status);
754     FloatParts pr = addsub_floats(pa, pb, false, status);
755 
756     return float64_round_pack_canonical(pr, status);
757 }
758 
759 float16 __attribute__((flatten)) float16_sub(float16 a, float16 b,
760                                              float_status *status)
761 {
762     FloatParts pa = float16_unpack_canonical(a, status);
763     FloatParts pb = float16_unpack_canonical(b, status);
764     FloatParts pr = addsub_floats(pa, pb, true, status);
765 
766     return float16_round_pack_canonical(pr, status);
767 }
768 
769 float32 __attribute__((flatten)) float32_sub(float32 a, float32 b,
770                                              float_status *status)
771 {
772     FloatParts pa = float32_unpack_canonical(a, status);
773     FloatParts pb = float32_unpack_canonical(b, status);
774     FloatParts pr = addsub_floats(pa, pb, true, status);
775 
776     return float32_round_pack_canonical(pr, status);
777 }
778 
779 float64 __attribute__((flatten)) float64_sub(float64 a, float64 b,
780                                              float_status *status)
781 {
782     FloatParts pa = float64_unpack_canonical(a, status);
783     FloatParts pb = float64_unpack_canonical(b, status);
784     FloatParts pr = addsub_floats(pa, pb, true, status);
785 
786     return float64_round_pack_canonical(pr, status);
787 }
788 
789 /*
790  * Returns the result of multiplying the floating-point values `a' and
791  * `b'. The operation is performed according to the IEC/IEEE Standard
792  * for Binary Floating-Point Arithmetic.
793  */
794 
795 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s)
796 {
797     bool sign = a.sign ^ b.sign;
798 
799     if (a.cls == float_class_normal && b.cls == float_class_normal) {
800         uint64_t hi, lo;
801         int exp = a.exp + b.exp;
802 
803         mul64To128(a.frac, b.frac, &hi, &lo);
804         shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
805         if (lo & DECOMPOSED_OVERFLOW_BIT) {
806             shift64RightJamming(lo, 1, &lo);
807             exp += 1;
808         }
809 
810         /* Re-use a */
811         a.exp = exp;
812         a.sign = sign;
813         a.frac = lo;
814         return a;
815     }
816     /* handle all the NaN cases */
817     if (is_nan(a.cls) || is_nan(b.cls)) {
818         return pick_nan(a, b, s);
819     }
820     /* Inf * Zero == NaN */
821     if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
822         (a.cls == float_class_zero && b.cls == float_class_inf)) {
823         s->float_exception_flags |= float_flag_invalid;
824         return parts_default_nan(s);
825     }
826     /* Multiply by 0 or Inf */
827     if (a.cls == float_class_inf || a.cls == float_class_zero) {
828         a.sign = sign;
829         return a;
830     }
831     if (b.cls == float_class_inf || b.cls == float_class_zero) {
832         b.sign = sign;
833         return b;
834     }
835     g_assert_not_reached();
836 }
837 
838 float16 __attribute__((flatten)) float16_mul(float16 a, float16 b,
839                                              float_status *status)
840 {
841     FloatParts pa = float16_unpack_canonical(a, status);
842     FloatParts pb = float16_unpack_canonical(b, status);
843     FloatParts pr = mul_floats(pa, pb, status);
844 
845     return float16_round_pack_canonical(pr, status);
846 }
847 
848 float32 __attribute__((flatten)) float32_mul(float32 a, float32 b,
849                                              float_status *status)
850 {
851     FloatParts pa = float32_unpack_canonical(a, status);
852     FloatParts pb = float32_unpack_canonical(b, status);
853     FloatParts pr = mul_floats(pa, pb, status);
854 
855     return float32_round_pack_canonical(pr, status);
856 }
857 
858 float64 __attribute__((flatten)) float64_mul(float64 a, float64 b,
859                                              float_status *status)
860 {
861     FloatParts pa = float64_unpack_canonical(a, status);
862     FloatParts pb = float64_unpack_canonical(b, status);
863     FloatParts pr = mul_floats(pa, pb, status);
864 
865     return float64_round_pack_canonical(pr, status);
866 }
867 
868 /*
869  * Returns the result of multiplying the floating-point values `a' and
870  * `b' then adding 'c', with no intermediate rounding step after the
871  * multiplication. The operation is performed according to the
872  * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
873  * The flags argument allows the caller to select negation of the
874  * addend, the intermediate product, or the final result. (The
875  * difference between this and having the caller do a separate
876  * negation is that negating externally will flip the sign bit on
877  * NaNs.)
878  */
879 
880 static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c,
881                                 int flags, float_status *s)
882 {
883     bool inf_zero = ((1 << a.cls) | (1 << b.cls)) ==
884                     ((1 << float_class_inf) | (1 << float_class_zero));
885     bool p_sign;
886     bool sign_flip = flags & float_muladd_negate_result;
887     FloatClass p_class;
888     uint64_t hi, lo;
889     int p_exp;
890 
891     /* It is implementation-defined whether the cases of (0,inf,qnan)
892      * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
893      * they return if they do), so we have to hand this information
894      * off to the target-specific pick-a-NaN routine.
895      */
896     if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) {
897         return pick_nan_muladd(a, b, c, inf_zero, s);
898     }
899 
900     if (inf_zero) {
901         s->float_exception_flags |= float_flag_invalid;
902         return parts_default_nan(s);
903     }
904 
905     if (flags & float_muladd_negate_c) {
906         c.sign ^= 1;
907     }
908 
909     p_sign = a.sign ^ b.sign;
910 
911     if (flags & float_muladd_negate_product) {
912         p_sign ^= 1;
913     }
914 
915     if (a.cls == float_class_inf || b.cls == float_class_inf) {
916         p_class = float_class_inf;
917     } else if (a.cls == float_class_zero || b.cls == float_class_zero) {
918         p_class = float_class_zero;
919     } else {
920         p_class = float_class_normal;
921     }
922 
923     if (c.cls == float_class_inf) {
924         if (p_class == float_class_inf && p_sign != c.sign) {
925             s->float_exception_flags |= float_flag_invalid;
926             return parts_default_nan(s);
927         } else {
928             a.cls = float_class_inf;
929             a.sign = c.sign ^ sign_flip;
930             return a;
931         }
932     }
933 
934     if (p_class == float_class_inf) {
935         a.cls = float_class_inf;
936         a.sign = p_sign ^ sign_flip;
937         return a;
938     }
939 
940     if (p_class == float_class_zero) {
941         if (c.cls == float_class_zero) {
942             if (p_sign != c.sign) {
943                 p_sign = s->float_rounding_mode == float_round_down;
944             }
945             c.sign = p_sign;
946         } else if (flags & float_muladd_halve_result) {
947             c.exp -= 1;
948         }
949         c.sign ^= sign_flip;
950         return c;
951     }
952 
953     /* a & b should be normals now... */
954     assert(a.cls == float_class_normal &&
955            b.cls == float_class_normal);
956 
957     p_exp = a.exp + b.exp;
958 
959     /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
960      * result.
961      */
962     mul64To128(a.frac, b.frac, &hi, &lo);
963     /* binary point now at bit 124 */
964 
965     /* check for overflow */
966     if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) {
967         shift128RightJamming(hi, lo, 1, &hi, &lo);
968         p_exp += 1;
969     }
970 
971     /* + add/sub */
972     if (c.cls == float_class_zero) {
973         /* move binary point back to 62 */
974         shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
975     } else {
976         int exp_diff = p_exp - c.exp;
977         if (p_sign == c.sign) {
978             /* Addition */
979             if (exp_diff <= 0) {
980                 shift128RightJamming(hi, lo,
981                                      DECOMPOSED_BINARY_POINT - exp_diff,
982                                      &hi, &lo);
983                 lo += c.frac;
984                 p_exp = c.exp;
985             } else {
986                 uint64_t c_hi, c_lo;
987                 /* shift c to the same binary point as the product (124) */
988                 c_hi = c.frac >> 2;
989                 c_lo = 0;
990                 shift128RightJamming(c_hi, c_lo,
991                                      exp_diff,
992                                      &c_hi, &c_lo);
993                 add128(hi, lo, c_hi, c_lo, &hi, &lo);
994                 /* move binary point back to 62 */
995                 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
996             }
997 
998             if (lo & DECOMPOSED_OVERFLOW_BIT) {
999                 shift64RightJamming(lo, 1, &lo);
1000                 p_exp += 1;
1001             }
1002 
1003         } else {
1004             /* Subtraction */
1005             uint64_t c_hi, c_lo;
1006             /* make C binary point match product at bit 124 */
1007             c_hi = c.frac >> 2;
1008             c_lo = 0;
1009 
1010             if (exp_diff <= 0) {
1011                 shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
1012                 if (exp_diff == 0
1013                     &&
1014                     (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
1015                     sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1016                 } else {
1017                     sub128(c_hi, c_lo, hi, lo, &hi, &lo);
1018                     p_sign ^= 1;
1019                     p_exp = c.exp;
1020                 }
1021             } else {
1022                 shift128RightJamming(c_hi, c_lo,
1023                                      exp_diff,
1024                                      &c_hi, &c_lo);
1025                 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1026             }
1027 
1028             if (hi == 0 && lo == 0) {
1029                 a.cls = float_class_zero;
1030                 a.sign = s->float_rounding_mode == float_round_down;
1031                 a.sign ^= sign_flip;
1032                 return a;
1033             } else {
1034                 int shift;
1035                 if (hi != 0) {
1036                     shift = clz64(hi);
1037                 } else {
1038                     shift = clz64(lo) + 64;
1039                 }
1040                 /* Normalizing to a binary point of 124 is the
1041                    correct adjust for the exponent.  However since we're
1042                    shifting, we might as well put the binary point back
1043                    at 62 where we really want it.  Therefore shift as
1044                    if we're leaving 1 bit at the top of the word, but
1045                    adjust the exponent as if we're leaving 3 bits.  */
1046                 shift -= 1;
1047                 if (shift >= 64) {
1048                     lo = lo << (shift - 64);
1049                 } else {
1050                     hi = (hi << shift) | (lo >> (64 - shift));
1051                     lo = hi | ((lo << shift) != 0);
1052                 }
1053                 p_exp -= shift - 2;
1054             }
1055         }
1056     }
1057 
1058     if (flags & float_muladd_halve_result) {
1059         p_exp -= 1;
1060     }
1061 
1062     /* finally prepare our result */
1063     a.cls = float_class_normal;
1064     a.sign = p_sign ^ sign_flip;
1065     a.exp = p_exp;
1066     a.frac = lo;
1067 
1068     return a;
1069 }
1070 
1071 float16 __attribute__((flatten)) float16_muladd(float16 a, float16 b, float16 c,
1072                                                 int flags, float_status *status)
1073 {
1074     FloatParts pa = float16_unpack_canonical(a, status);
1075     FloatParts pb = float16_unpack_canonical(b, status);
1076     FloatParts pc = float16_unpack_canonical(c, status);
1077     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1078 
1079     return float16_round_pack_canonical(pr, status);
1080 }
1081 
1082 float32 __attribute__((flatten)) float32_muladd(float32 a, float32 b, float32 c,
1083                                                 int flags, float_status *status)
1084 {
1085     FloatParts pa = float32_unpack_canonical(a, status);
1086     FloatParts pb = float32_unpack_canonical(b, status);
1087     FloatParts pc = float32_unpack_canonical(c, status);
1088     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1089 
1090     return float32_round_pack_canonical(pr, status);
1091 }
1092 
1093 float64 __attribute__((flatten)) float64_muladd(float64 a, float64 b, float64 c,
1094                                                 int flags, float_status *status)
1095 {
1096     FloatParts pa = float64_unpack_canonical(a, status);
1097     FloatParts pb = float64_unpack_canonical(b, status);
1098     FloatParts pc = float64_unpack_canonical(c, status);
1099     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1100 
1101     return float64_round_pack_canonical(pr, status);
1102 }
1103 
1104 /*
1105  * Returns the result of dividing the floating-point value `a' by the
1106  * corresponding value `b'. The operation is performed according to
1107  * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1108  */
1109 
1110 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s)
1111 {
1112     bool sign = a.sign ^ b.sign;
1113 
1114     if (a.cls == float_class_normal && b.cls == float_class_normal) {
1115         uint64_t temp_lo, temp_hi;
1116         int exp = a.exp - b.exp;
1117         if (a.frac < b.frac) {
1118             exp -= 1;
1119             shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1,
1120                               &temp_hi, &temp_lo);
1121         } else {
1122             shortShift128Left(0, a.frac, DECOMPOSED_BINARY_POINT,
1123                               &temp_hi, &temp_lo);
1124         }
1125         /* LSB of quot is set if inexact which roundandpack will use
1126          * to set flags. Yet again we re-use a for the result */
1127         a.frac = div128To64(temp_lo, temp_hi, b.frac);
1128         a.sign = sign;
1129         a.exp = exp;
1130         return a;
1131     }
1132     /* handle all the NaN cases */
1133     if (is_nan(a.cls) || is_nan(b.cls)) {
1134         return pick_nan(a, b, s);
1135     }
1136     /* 0/0 or Inf/Inf */
1137     if (a.cls == b.cls
1138         &&
1139         (a.cls == float_class_inf || a.cls == float_class_zero)) {
1140         s->float_exception_flags |= float_flag_invalid;
1141         return parts_default_nan(s);
1142     }
1143     /* Inf / x or 0 / x */
1144     if (a.cls == float_class_inf || a.cls == float_class_zero) {
1145         a.sign = sign;
1146         return a;
1147     }
1148     /* Div 0 => Inf */
1149     if (b.cls == float_class_zero) {
1150         s->float_exception_flags |= float_flag_divbyzero;
1151         a.cls = float_class_inf;
1152         a.sign = sign;
1153         return a;
1154     }
1155     /* Div by Inf */
1156     if (b.cls == float_class_inf) {
1157         a.cls = float_class_zero;
1158         a.sign = sign;
1159         return a;
1160     }
1161     g_assert_not_reached();
1162 }
1163 
1164 float16 float16_div(float16 a, float16 b, float_status *status)
1165 {
1166     FloatParts pa = float16_unpack_canonical(a, status);
1167     FloatParts pb = float16_unpack_canonical(b, status);
1168     FloatParts pr = div_floats(pa, pb, status);
1169 
1170     return float16_round_pack_canonical(pr, status);
1171 }
1172 
1173 float32 float32_div(float32 a, float32 b, float_status *status)
1174 {
1175     FloatParts pa = float32_unpack_canonical(a, status);
1176     FloatParts pb = float32_unpack_canonical(b, status);
1177     FloatParts pr = div_floats(pa, pb, status);
1178 
1179     return float32_round_pack_canonical(pr, status);
1180 }
1181 
1182 float64 float64_div(float64 a, float64 b, float_status *status)
1183 {
1184     FloatParts pa = float64_unpack_canonical(a, status);
1185     FloatParts pb = float64_unpack_canonical(b, status);
1186     FloatParts pr = div_floats(pa, pb, status);
1187 
1188     return float64_round_pack_canonical(pr, status);
1189 }
1190 
1191 /*
1192  * Float to Float conversions
1193  *
1194  * Returns the result of converting one float format to another. The
1195  * conversion is performed according to the IEC/IEEE Standard for
1196  * Binary Floating-Point Arithmetic.
1197  *
1198  * The float_to_float helper only needs to take care of raising
1199  * invalid exceptions and handling the conversion on NaNs.
1200  */
1201 
1202 static FloatParts float_to_float(FloatParts a, const FloatFmt *dstf,
1203                                  float_status *s)
1204 {
1205     if (dstf->arm_althp) {
1206         switch (a.cls) {
1207         case float_class_qnan:
1208         case float_class_snan:
1209             /* There is no NaN in the destination format.  Raise Invalid
1210              * and return a zero with the sign of the input NaN.
1211              */
1212             s->float_exception_flags |= float_flag_invalid;
1213             a.cls = float_class_zero;
1214             a.frac = 0;
1215             a.exp = 0;
1216             break;
1217 
1218         case float_class_inf:
1219             /* There is no Inf in the destination format.  Raise Invalid
1220              * and return the maximum normal with the correct sign.
1221              */
1222             s->float_exception_flags |= float_flag_invalid;
1223             a.cls = float_class_normal;
1224             a.exp = dstf->exp_max;
1225             a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift;
1226             break;
1227 
1228         default:
1229             break;
1230         }
1231     } else if (is_nan(a.cls)) {
1232         if (is_snan(a.cls)) {
1233             s->float_exception_flags |= float_flag_invalid;
1234             a = parts_silence_nan(a, s);
1235         }
1236         if (s->default_nan_mode) {
1237             return parts_default_nan(s);
1238         }
1239     }
1240     return a;
1241 }
1242 
1243 float32 float16_to_float32(float16 a, bool ieee, float_status *s)
1244 {
1245     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1246     FloatParts p = float16a_unpack_canonical(a, s, fmt16);
1247     FloatParts pr = float_to_float(p, &float32_params, s);
1248     return float32_round_pack_canonical(pr, s);
1249 }
1250 
1251 float64 float16_to_float64(float16 a, bool ieee, float_status *s)
1252 {
1253     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1254     FloatParts p = float16a_unpack_canonical(a, s, fmt16);
1255     FloatParts pr = float_to_float(p, &float64_params, s);
1256     return float64_round_pack_canonical(pr, s);
1257 }
1258 
1259 float16 float32_to_float16(float32 a, bool ieee, float_status *s)
1260 {
1261     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1262     FloatParts p = float32_unpack_canonical(a, s);
1263     FloatParts pr = float_to_float(p, fmt16, s);
1264     return float16a_round_pack_canonical(pr, s, fmt16);
1265 }
1266 
1267 float64 float32_to_float64(float32 a, float_status *s)
1268 {
1269     FloatParts p = float32_unpack_canonical(a, s);
1270     FloatParts pr = float_to_float(p, &float64_params, s);
1271     return float64_round_pack_canonical(pr, s);
1272 }
1273 
1274 float16 float64_to_float16(float64 a, bool ieee, float_status *s)
1275 {
1276     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1277     FloatParts p = float64_unpack_canonical(a, s);
1278     FloatParts pr = float_to_float(p, fmt16, s);
1279     return float16a_round_pack_canonical(pr, s, fmt16);
1280 }
1281 
1282 float32 float64_to_float32(float64 a, float_status *s)
1283 {
1284     FloatParts p = float64_unpack_canonical(a, s);
1285     FloatParts pr = float_to_float(p, &float32_params, s);
1286     return float32_round_pack_canonical(pr, s);
1287 }
1288 
1289 /*
1290  * Rounds the floating-point value `a' to an integer, and returns the
1291  * result as a floating-point value. The operation is performed
1292  * according to the IEC/IEEE Standard for Binary Floating-Point
1293  * Arithmetic.
1294  */
1295 
1296 static FloatParts round_to_int(FloatParts a, int rounding_mode, float_status *s)
1297 {
1298     if (is_nan(a.cls)) {
1299         return return_nan(a, s);
1300     }
1301 
1302     switch (a.cls) {
1303     case float_class_zero:
1304     case float_class_inf:
1305     case float_class_qnan:
1306         /* already "integral" */
1307         break;
1308     case float_class_normal:
1309         if (a.exp >= DECOMPOSED_BINARY_POINT) {
1310             /* already integral */
1311             break;
1312         }
1313         if (a.exp < 0) {
1314             bool one;
1315             /* all fractional */
1316             s->float_exception_flags |= float_flag_inexact;
1317             switch (rounding_mode) {
1318             case float_round_nearest_even:
1319                 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
1320                 break;
1321             case float_round_ties_away:
1322                 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
1323                 break;
1324             case float_round_to_zero:
1325                 one = false;
1326                 break;
1327             case float_round_up:
1328                 one = !a.sign;
1329                 break;
1330             case float_round_down:
1331                 one = a.sign;
1332                 break;
1333             default:
1334                 g_assert_not_reached();
1335             }
1336 
1337             if (one) {
1338                 a.frac = DECOMPOSED_IMPLICIT_BIT;
1339                 a.exp = 0;
1340             } else {
1341                 a.cls = float_class_zero;
1342             }
1343         } else {
1344             uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
1345             uint64_t frac_lsbm1 = frac_lsb >> 1;
1346             uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
1347             uint64_t rnd_mask = rnd_even_mask >> 1;
1348             uint64_t inc;
1349 
1350             switch (rounding_mode) {
1351             case float_round_nearest_even:
1352                 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
1353                 break;
1354             case float_round_ties_away:
1355                 inc = frac_lsbm1;
1356                 break;
1357             case float_round_to_zero:
1358                 inc = 0;
1359                 break;
1360             case float_round_up:
1361                 inc = a.sign ? 0 : rnd_mask;
1362                 break;
1363             case float_round_down:
1364                 inc = a.sign ? rnd_mask : 0;
1365                 break;
1366             default:
1367                 g_assert_not_reached();
1368             }
1369 
1370             if (a.frac & rnd_mask) {
1371                 s->float_exception_flags |= float_flag_inexact;
1372                 a.frac += inc;
1373                 a.frac &= ~rnd_mask;
1374                 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
1375                     a.frac >>= 1;
1376                     a.exp++;
1377                 }
1378             }
1379         }
1380         break;
1381     default:
1382         g_assert_not_reached();
1383     }
1384     return a;
1385 }
1386 
1387 float16 float16_round_to_int(float16 a, float_status *s)
1388 {
1389     FloatParts pa = float16_unpack_canonical(a, s);
1390     FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1391     return float16_round_pack_canonical(pr, s);
1392 }
1393 
1394 float32 float32_round_to_int(float32 a, float_status *s)
1395 {
1396     FloatParts pa = float32_unpack_canonical(a, s);
1397     FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1398     return float32_round_pack_canonical(pr, s);
1399 }
1400 
1401 float64 float64_round_to_int(float64 a, float_status *s)
1402 {
1403     FloatParts pa = float64_unpack_canonical(a, s);
1404     FloatParts pr = round_to_int(pa, s->float_rounding_mode, s);
1405     return float64_round_pack_canonical(pr, s);
1406 }
1407 
1408 float64 float64_trunc_to_int(float64 a, float_status *s)
1409 {
1410     FloatParts pa = float64_unpack_canonical(a, s);
1411     FloatParts pr = round_to_int(pa, float_round_to_zero, s);
1412     return float64_round_pack_canonical(pr, s);
1413 }
1414 
1415 /*
1416  * Returns the result of converting the floating-point value `a' to
1417  * the two's complement integer format. The conversion is performed
1418  * according to the IEC/IEEE Standard for Binary Floating-Point
1419  * Arithmetic---which means in particular that the conversion is
1420  * rounded according to the current rounding mode. If `a' is a NaN,
1421  * the largest positive integer is returned. Otherwise, if the
1422  * conversion overflows, the largest integer with the same sign as `a'
1423  * is returned.
1424 */
1425 
1426 static int64_t round_to_int_and_pack(FloatParts in, int rmode,
1427                                      int64_t min, int64_t max,
1428                                      float_status *s)
1429 {
1430     uint64_t r;
1431     int orig_flags = get_float_exception_flags(s);
1432     FloatParts p = round_to_int(in, rmode, s);
1433 
1434     switch (p.cls) {
1435     case float_class_snan:
1436     case float_class_qnan:
1437         s->float_exception_flags = orig_flags | float_flag_invalid;
1438         return max;
1439     case float_class_inf:
1440         s->float_exception_flags = orig_flags | float_flag_invalid;
1441         return p.sign ? min : max;
1442     case float_class_zero:
1443         return 0;
1444     case float_class_normal:
1445         if (p.exp < DECOMPOSED_BINARY_POINT) {
1446             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1447         } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1448             r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1449         } else {
1450             r = UINT64_MAX;
1451         }
1452         if (p.sign) {
1453             if (r <= -(uint64_t) min) {
1454                 return -r;
1455             } else {
1456                 s->float_exception_flags = orig_flags | float_flag_invalid;
1457                 return min;
1458             }
1459         } else {
1460             if (r <= max) {
1461                 return r;
1462             } else {
1463                 s->float_exception_flags = orig_flags | float_flag_invalid;
1464                 return max;
1465             }
1466         }
1467     default:
1468         g_assert_not_reached();
1469     }
1470 }
1471 
1472 #define FLOAT_TO_INT(fsz, isz)                                          \
1473 int ## isz ## _t float ## fsz ## _to_int ## isz(float ## fsz a,         \
1474                                                 float_status *s)        \
1475 {                                                                       \
1476     FloatParts p = float ## fsz ## _unpack_canonical(a, s);             \
1477     return round_to_int_and_pack(p, s->float_rounding_mode,             \
1478                                  INT ## isz ## _MIN, INT ## isz ## _MAX,\
1479                                  s);                                    \
1480 }                                                                       \
1481                                                                         \
1482 int ## isz ## _t float ## fsz ## _to_int ## isz ## _round_to_zero       \
1483  (float ## fsz a, float_status *s)                                      \
1484 {                                                                       \
1485     FloatParts p = float ## fsz ## _unpack_canonical(a, s);             \
1486     return round_to_int_and_pack(p, float_round_to_zero,                \
1487                                  INT ## isz ## _MIN, INT ## isz ## _MAX,\
1488                                  s);                                    \
1489 }
1490 
1491 FLOAT_TO_INT(16, 16)
1492 FLOAT_TO_INT(16, 32)
1493 FLOAT_TO_INT(16, 64)
1494 
1495 FLOAT_TO_INT(32, 16)
1496 FLOAT_TO_INT(32, 32)
1497 FLOAT_TO_INT(32, 64)
1498 
1499 FLOAT_TO_INT(64, 16)
1500 FLOAT_TO_INT(64, 32)
1501 FLOAT_TO_INT(64, 64)
1502 
1503 #undef FLOAT_TO_INT
1504 
1505 /*
1506  *  Returns the result of converting the floating-point value `a' to
1507  *  the unsigned integer format. The conversion is performed according
1508  *  to the IEC/IEEE Standard for Binary Floating-Point
1509  *  Arithmetic---which means in particular that the conversion is
1510  *  rounded according to the current rounding mode. If `a' is a NaN,
1511  *  the largest unsigned integer is returned. Otherwise, if the
1512  *  conversion overflows, the largest unsigned integer is returned. If
1513  *  the 'a' is negative, the result is rounded and zero is returned;
1514  *  values that do not round to zero will raise the inexact exception
1515  *  flag.
1516  */
1517 
1518 static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, uint64_t max,
1519                                        float_status *s)
1520 {
1521     int orig_flags = get_float_exception_flags(s);
1522     FloatParts p = round_to_int(in, rmode, s);
1523 
1524     switch (p.cls) {
1525     case float_class_snan:
1526     case float_class_qnan:
1527         s->float_exception_flags = orig_flags | float_flag_invalid;
1528         return max;
1529     case float_class_inf:
1530         s->float_exception_flags = orig_flags | float_flag_invalid;
1531         return p.sign ? 0 : max;
1532     case float_class_zero:
1533         return 0;
1534     case float_class_normal:
1535     {
1536         uint64_t r;
1537         if (p.sign) {
1538             s->float_exception_flags = orig_flags | float_flag_invalid;
1539             return 0;
1540         }
1541 
1542         if (p.exp < DECOMPOSED_BINARY_POINT) {
1543             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1544         } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1545             r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1546         } else {
1547             s->float_exception_flags = orig_flags | float_flag_invalid;
1548             return max;
1549         }
1550 
1551         /* For uint64 this will never trip, but if p.exp is too large
1552          * to shift a decomposed fraction we shall have exited via the
1553          * 3rd leg above.
1554          */
1555         if (r > max) {
1556             s->float_exception_flags = orig_flags | float_flag_invalid;
1557             return max;
1558         } else {
1559             return r;
1560         }
1561     }
1562     default:
1563         g_assert_not_reached();
1564     }
1565 }
1566 
1567 #define FLOAT_TO_UINT(fsz, isz) \
1568 uint ## isz ## _t float ## fsz ## _to_uint ## isz(float ## fsz a,       \
1569                                                   float_status *s)      \
1570 {                                                                       \
1571     FloatParts p = float ## fsz ## _unpack_canonical(a, s);             \
1572     return round_to_uint_and_pack(p, s->float_rounding_mode,            \
1573                                  UINT ## isz ## _MAX, s);               \
1574 }                                                                       \
1575                                                                         \
1576 uint ## isz ## _t float ## fsz ## _to_uint ## isz ## _round_to_zero     \
1577  (float ## fsz a, float_status *s)                                      \
1578 {                                                                       \
1579     FloatParts p = float ## fsz ## _unpack_canonical(a, s);             \
1580     return round_to_uint_and_pack(p, float_round_to_zero,               \
1581                                   UINT ## isz ## _MAX, s);              \
1582 }
1583 
1584 FLOAT_TO_UINT(16, 16)
1585 FLOAT_TO_UINT(16, 32)
1586 FLOAT_TO_UINT(16, 64)
1587 
1588 FLOAT_TO_UINT(32, 16)
1589 FLOAT_TO_UINT(32, 32)
1590 FLOAT_TO_UINT(32, 64)
1591 
1592 FLOAT_TO_UINT(64, 16)
1593 FLOAT_TO_UINT(64, 32)
1594 FLOAT_TO_UINT(64, 64)
1595 
1596 #undef FLOAT_TO_UINT
1597 
1598 /*
1599  * Integer to float conversions
1600  *
1601  * Returns the result of converting the two's complement integer `a'
1602  * to the floating-point format. The conversion is performed according
1603  * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1604  */
1605 
1606 static FloatParts int_to_float(int64_t a, float_status *status)
1607 {
1608     FloatParts r = {};
1609     if (a == 0) {
1610         r.cls = float_class_zero;
1611         r.sign = false;
1612     } else if (a == (1ULL << 63)) {
1613         r.cls = float_class_normal;
1614         r.sign = true;
1615         r.frac = DECOMPOSED_IMPLICIT_BIT;
1616         r.exp = 63;
1617     } else {
1618         uint64_t f;
1619         if (a < 0) {
1620             f = -a;
1621             r.sign = true;
1622         } else {
1623             f = a;
1624             r.sign = false;
1625         }
1626         int shift = clz64(f) - 1;
1627         r.cls = float_class_normal;
1628         r.exp = (DECOMPOSED_BINARY_POINT - shift);
1629         r.frac = f << shift;
1630     }
1631 
1632     return r;
1633 }
1634 
1635 float16 int64_to_float16(int64_t a, float_status *status)
1636 {
1637     FloatParts pa = int_to_float(a, status);
1638     return float16_round_pack_canonical(pa, status);
1639 }
1640 
1641 float16 int32_to_float16(int32_t a, float_status *status)
1642 {
1643     return int64_to_float16(a, status);
1644 }
1645 
1646 float16 int16_to_float16(int16_t a, float_status *status)
1647 {
1648     return int64_to_float16(a, status);
1649 }
1650 
1651 float32 int64_to_float32(int64_t a, float_status *status)
1652 {
1653     FloatParts pa = int_to_float(a, status);
1654     return float32_round_pack_canonical(pa, status);
1655 }
1656 
1657 float32 int32_to_float32(int32_t a, float_status *status)
1658 {
1659     return int64_to_float32(a, status);
1660 }
1661 
1662 float32 int16_to_float32(int16_t a, float_status *status)
1663 {
1664     return int64_to_float32(a, status);
1665 }
1666 
1667 float64 int64_to_float64(int64_t a, float_status *status)
1668 {
1669     FloatParts pa = int_to_float(a, status);
1670     return float64_round_pack_canonical(pa, status);
1671 }
1672 
1673 float64 int32_to_float64(int32_t a, float_status *status)
1674 {
1675     return int64_to_float64(a, status);
1676 }
1677 
1678 float64 int16_to_float64(int16_t a, float_status *status)
1679 {
1680     return int64_to_float64(a, status);
1681 }
1682 
1683 
1684 /*
1685  * Unsigned Integer to float conversions
1686  *
1687  * Returns the result of converting the unsigned integer `a' to the
1688  * floating-point format. The conversion is performed according to the
1689  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1690  */
1691 
1692 static FloatParts uint_to_float(uint64_t a, float_status *status)
1693 {
1694     FloatParts r = { .sign = false};
1695 
1696     if (a == 0) {
1697         r.cls = float_class_zero;
1698     } else {
1699         int spare_bits = clz64(a) - 1;
1700         r.cls = float_class_normal;
1701         r.exp = DECOMPOSED_BINARY_POINT - spare_bits;
1702         if (spare_bits < 0) {
1703             shift64RightJamming(a, -spare_bits, &a);
1704             r.frac = a;
1705         } else {
1706             r.frac = a << spare_bits;
1707         }
1708     }
1709 
1710     return r;
1711 }
1712 
1713 float16 uint64_to_float16(uint64_t a, float_status *status)
1714 {
1715     FloatParts pa = uint_to_float(a, status);
1716     return float16_round_pack_canonical(pa, status);
1717 }
1718 
1719 float16 uint32_to_float16(uint32_t a, float_status *status)
1720 {
1721     return uint64_to_float16(a, status);
1722 }
1723 
1724 float16 uint16_to_float16(uint16_t a, float_status *status)
1725 {
1726     return uint64_to_float16(a, status);
1727 }
1728 
1729 float32 uint64_to_float32(uint64_t a, float_status *status)
1730 {
1731     FloatParts pa = uint_to_float(a, status);
1732     return float32_round_pack_canonical(pa, status);
1733 }
1734 
1735 float32 uint32_to_float32(uint32_t a, float_status *status)
1736 {
1737     return uint64_to_float32(a, status);
1738 }
1739 
1740 float32 uint16_to_float32(uint16_t a, float_status *status)
1741 {
1742     return uint64_to_float32(a, status);
1743 }
1744 
1745 float64 uint64_to_float64(uint64_t a, float_status *status)
1746 {
1747     FloatParts pa = uint_to_float(a, status);
1748     return float64_round_pack_canonical(pa, status);
1749 }
1750 
1751 float64 uint32_to_float64(uint32_t a, float_status *status)
1752 {
1753     return uint64_to_float64(a, status);
1754 }
1755 
1756 float64 uint16_to_float64(uint16_t a, float_status *status)
1757 {
1758     return uint64_to_float64(a, status);
1759 }
1760 
/* Float Min/Max */
/* min() and max() functions. These can't be implemented as
 * 'compare and pick one input' because that would mishandle
 * NaNs and +0 vs -0.
 *
 * minnum() and maxnum() functions. These are similar to the min()
 * and max() functions but if one of the arguments is a QNaN and
 * the other is numerical then the numerical argument is returned.
 * SNaNs will get quietened before being returned.
 * minnum() and maxnum() correspond to the IEEE 754-2008 minNum()
 * and maxNum() operations. min() and max() are the typical min/max
 * semantics provided by many CPUs which predate that specification.
 *
 * minnummag() and maxnummag() functions correspond to minNumMag()
 * and maxNumMag() from IEEE 754-2008.
 */
1777 static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin,
1778                                 bool ieee, bool ismag, float_status *s)
1779 {
1780     if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
1781         if (ieee) {
1782             /* Takes two floating-point values `a' and `b', one of
1783              * which is a NaN, and returns the appropriate NaN
1784              * result. If either `a' or `b' is a signaling NaN,
1785              * the invalid exception is raised.
1786              */
1787             if (is_snan(a.cls) || is_snan(b.cls)) {
1788                 return pick_nan(a, b, s);
1789             } else if (is_nan(a.cls) && !is_nan(b.cls)) {
1790                 return b;
1791             } else if (is_nan(b.cls) && !is_nan(a.cls)) {
1792                 return a;
1793             }
1794         }
1795         return pick_nan(a, b, s);
1796     } else {
1797         int a_exp, b_exp;
1798 
1799         switch (a.cls) {
1800         case float_class_normal:
1801             a_exp = a.exp;
1802             break;
1803         case float_class_inf:
1804             a_exp = INT_MAX;
1805             break;
1806         case float_class_zero:
1807             a_exp = INT_MIN;
1808             break;
1809         default:
1810             g_assert_not_reached();
1811             break;
1812         }
1813         switch (b.cls) {
1814         case float_class_normal:
1815             b_exp = b.exp;
1816             break;
1817         case float_class_inf:
1818             b_exp = INT_MAX;
1819             break;
1820         case float_class_zero:
1821             b_exp = INT_MIN;
1822             break;
1823         default:
1824             g_assert_not_reached();
1825             break;
1826         }
1827 
1828         if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
1829             bool a_less = a_exp < b_exp;
1830             if (a_exp == b_exp) {
1831                 a_less = a.frac < b.frac;
1832             }
1833             return a_less ^ ismin ? b : a;
1834         }
1835 
1836         if (a.sign == b.sign) {
1837             bool a_less = a_exp < b_exp;
1838             if (a_exp == b_exp) {
1839                 a_less = a.frac < b.frac;
1840             }
1841             return a.sign ^ a_less ^ ismin ? b : a;
1842         } else {
1843             return a.sign ^ ismin ? b : a;
1844         }
1845     }
1846 }
1847 
1848 #define MINMAX(sz, name, ismin, isiee, ismag)                           \
1849 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b,      \
1850                                      float_status *s)                   \
1851 {                                                                       \
1852     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
1853     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
1854     FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);      \
1855                                                                         \
1856     return float ## sz ## _round_pack_canonical(pr, s);                 \
1857 }
1858 
1859 MINMAX(16, min, true, false, false)
1860 MINMAX(16, minnum, true, true, false)
1861 MINMAX(16, minnummag, true, true, true)
1862 MINMAX(16, max, false, false, false)
1863 MINMAX(16, maxnum, false, true, false)
1864 MINMAX(16, maxnummag, false, true, true)
1865 
1866 MINMAX(32, min, true, false, false)
1867 MINMAX(32, minnum, true, true, false)
1868 MINMAX(32, minnummag, true, true, true)
1869 MINMAX(32, max, false, false, false)
1870 MINMAX(32, maxnum, false, true, false)
1871 MINMAX(32, maxnummag, false, true, true)
1872 
1873 MINMAX(64, min, true, false, false)
1874 MINMAX(64, minnum, true, true, false)
1875 MINMAX(64, minnummag, true, true, true)
1876 MINMAX(64, max, false, false, false)
1877 MINMAX(64, maxnum, false, true, false)
1878 MINMAX(64, maxnummag, false, true, true)
1879 
1880 #undef MINMAX
1881 
1882 /* Floating point compare */
1883 static int compare_floats(FloatParts a, FloatParts b, bool is_quiet,
1884                           float_status *s)
1885 {
1886     if (is_nan(a.cls) || is_nan(b.cls)) {
1887         if (!is_quiet ||
1888             a.cls == float_class_snan ||
1889             b.cls == float_class_snan) {
1890             s->float_exception_flags |= float_flag_invalid;
1891         }
1892         return float_relation_unordered;
1893     }
1894 
1895     if (a.cls == float_class_zero) {
1896         if (b.cls == float_class_zero) {
1897             return float_relation_equal;
1898         }
1899         return b.sign ? float_relation_greater : float_relation_less;
1900     } else if (b.cls == float_class_zero) {
1901         return a.sign ? float_relation_less : float_relation_greater;
1902     }
1903 
1904     /* The only really important thing about infinity is its sign. If
1905      * both are infinities the sign marks the smallest of the two.
1906      */
1907     if (a.cls == float_class_inf) {
1908         if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
1909             return float_relation_equal;
1910         }
1911         return a.sign ? float_relation_less : float_relation_greater;
1912     } else if (b.cls == float_class_inf) {
1913         return b.sign ? float_relation_greater : float_relation_less;
1914     }
1915 
1916     if (a.sign != b.sign) {
1917         return a.sign ? float_relation_less : float_relation_greater;
1918     }
1919 
1920     if (a.exp == b.exp) {
1921         if (a.frac == b.frac) {
1922             return float_relation_equal;
1923         }
1924         if (a.sign) {
1925             return a.frac > b.frac ?
1926                 float_relation_less : float_relation_greater;
1927         } else {
1928             return a.frac > b.frac ?
1929                 float_relation_greater : float_relation_less;
1930         }
1931     } else {
1932         if (a.sign) {
1933             return a.exp > b.exp ? float_relation_less : float_relation_greater;
1934         } else {
1935             return a.exp > b.exp ? float_relation_greater : float_relation_less;
1936         }
1937     }
1938 }
1939 
1940 #define COMPARE(sz)                                                     \
1941 int float ## sz ## _compare(float ## sz a, float ## sz b,               \
1942                             float_status *s)                            \
1943 {                                                                       \
1944     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
1945     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
1946     return compare_floats(pa, pb, false, s);                            \
1947 }                                                                       \
1948 int float ## sz ## _compare_quiet(float ## sz a, float ## sz b,         \
1949                                   float_status *s)                      \
1950 {                                                                       \
1951     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
1952     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
1953     return compare_floats(pa, pb, true, s);                             \
1954 }
1955 
1956 COMPARE(16)
1957 COMPARE(32)
1958 COMPARE(64)
1959 
1960 #undef COMPARE
1961 
1962 /* Multiply A by 2 raised to the power N.  */
1963 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s)
1964 {
1965     if (unlikely(is_nan(a.cls))) {
1966         return return_nan(a, s);
1967     }
1968     if (a.cls == float_class_normal) {
1969         /* The largest float type (even though not supported by FloatParts)
1970          * is float128, which has a 15 bit exponent.  Bounding N to 16 bits
1971          * still allows rounding to infinity, without allowing overflow
1972          * within the int32_t that backs FloatParts.exp.
1973          */
1974         n = MIN(MAX(n, -0x10000), 0x10000);
1975         a.exp += n;
1976     }
1977     return a;
1978 }
1979 
1980 float16 float16_scalbn(float16 a, int n, float_status *status)
1981 {
1982     FloatParts pa = float16_unpack_canonical(a, status);
1983     FloatParts pr = scalbn_decomposed(pa, n, status);
1984     return float16_round_pack_canonical(pr, status);
1985 }
1986 
1987 float32 float32_scalbn(float32 a, int n, float_status *status)
1988 {
1989     FloatParts pa = float32_unpack_canonical(a, status);
1990     FloatParts pr = scalbn_decomposed(pa, n, status);
1991     return float32_round_pack_canonical(pr, status);
1992 }
1993 
1994 float64 float64_scalbn(float64 a, int n, float_status *status)
1995 {
1996     FloatParts pa = float64_unpack_canonical(a, status);
1997     FloatParts pr = scalbn_decomposed(pa, n, status);
1998     return float64_round_pack_canonical(pr, status);
1999 }
2000 
/*
 * Square Root
 *
 * The old softfloat code did an approximation step before zeroing in
 * on the final result. However, for simplicity we just compute the
 * square root by iterating down from the implicit bit to enough extra
 * bits to ensure we get a correctly rounded result.
 *
 * This does mean, however, that the calculation is slower than before,
 * especially for 64 bit floats.
 */
2012 
2013 static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p)
2014 {
2015     uint64_t a_frac, r_frac, s_frac;
2016     int bit, last_bit;
2017 
2018     if (is_nan(a.cls)) {
2019         return return_nan(a, s);
2020     }
2021     if (a.cls == float_class_zero) {
2022         return a;  /* sqrt(+-0) = +-0 */
2023     }
2024     if (a.sign) {
2025         s->float_exception_flags |= float_flag_invalid;
2026         return parts_default_nan(s);
2027     }
2028     if (a.cls == float_class_inf) {
2029         return a;  /* sqrt(+inf) = +inf */
2030     }
2031 
2032     assert(a.cls == float_class_normal);
2033 
2034     /* We need two overflow bits at the top. Adding room for that is a
2035      * right shift. If the exponent is odd, we can discard the low bit
2036      * by multiplying the fraction by 2; that's a left shift. Combine
2037      * those and we shift right if the exponent is even.
2038      */
2039     a_frac = a.frac;
2040     if (!(a.exp & 1)) {
2041         a_frac >>= 1;
2042     }
2043     a.exp >>= 1;
2044 
2045     /* Bit-by-bit computation of sqrt.  */
2046     r_frac = 0;
2047     s_frac = 0;
2048 
2049     /* Iterate from implicit bit down to the 3 extra bits to compute a
2050      * properly rounded result. Remember we've inserted one more bit
2051      * at the top, so these positions are one less.
2052      */
2053     bit = DECOMPOSED_BINARY_POINT - 1;
2054     last_bit = MAX(p->frac_shift - 4, 0);
2055     do {
2056         uint64_t q = 1ULL << bit;
2057         uint64_t t_frac = s_frac + q;
2058         if (t_frac <= a_frac) {
2059             s_frac = t_frac + q;
2060             a_frac -= t_frac;
2061             r_frac += q;
2062         }
2063         a_frac <<= 1;
2064     } while (--bit >= last_bit);
2065 
2066     /* Undo the right shift done above. If there is any remaining
2067      * fraction, the result is inexact. Set the sticky bit.
2068      */
2069     a.frac = (r_frac << 1) + (a_frac != 0);
2070 
2071     return a;
2072 }
2073 
2074 float16 __attribute__((flatten)) float16_sqrt(float16 a, float_status *status)
2075 {
2076     FloatParts pa = float16_unpack_canonical(a, status);
2077     FloatParts pr = sqrt_float(pa, status, &float16_params);
2078     return float16_round_pack_canonical(pr, status);
2079 }
2080 
2081 float32 __attribute__((flatten)) float32_sqrt(float32 a, float_status *status)
2082 {
2083     FloatParts pa = float32_unpack_canonical(a, status);
2084     FloatParts pr = sqrt_float(pa, status, &float32_params);
2085     return float32_round_pack_canonical(pr, status);
2086 }
2087 
2088 float64 __attribute__((flatten)) float64_sqrt(float64 a, float_status *status)
2089 {
2090     FloatParts pa = float64_unpack_canonical(a, status);
2091     FloatParts pr = sqrt_float(pa, status, &float64_params);
2092     return float64_round_pack_canonical(pr, status);
2093 }
2094 
2095 /*----------------------------------------------------------------------------
2096 | The pattern for a default generated NaN.
2097 *----------------------------------------------------------------------------*/
2098 
2099 float16 float16_default_nan(float_status *status)
2100 {
2101     FloatParts p = parts_default_nan(status);
2102     p.frac >>= float16_params.frac_shift;
2103     return float16_pack_raw(p);
2104 }
2105 
2106 float32 float32_default_nan(float_status *status)
2107 {
2108     FloatParts p = parts_default_nan(status);
2109     p.frac >>= float32_params.frac_shift;
2110     return float32_pack_raw(p);
2111 }
2112 
2113 float64 float64_default_nan(float_status *status)
2114 {
2115     FloatParts p = parts_default_nan(status);
2116     p.frac >>= float64_params.frac_shift;
2117     return float64_pack_raw(p);
2118 }
2119 
2120 float128 float128_default_nan(float_status *status)
2121 {
2122     FloatParts p = parts_default_nan(status);
2123     float128 r;
2124 
2125     /* Extrapolate from the choices made by parts_default_nan to fill
2126      * in the quad-floating format.  If the low bit is set, assume we
2127      * want to set all non-snan bits.
2128      */
2129     r.low = -(p.frac & 1);
2130     r.high = p.frac >> (DECOMPOSED_BINARY_POINT - 48);
2131     r.high |= LIT64(0x7FFF000000000000);
2132     r.high |= (uint64_t)p.sign << 63;
2133 
2134     return r;
2135 }
2136 
2137 /*----------------------------------------------------------------------------
2138 | Returns a quiet NaN from a signalling NaN for the floating point value `a'.
2139 *----------------------------------------------------------------------------*/
2140 
2141 float16 float16_silence_nan(float16 a, float_status *status)
2142 {
2143     FloatParts p = float16_unpack_raw(a);
2144     p.frac <<= float16_params.frac_shift;
2145     p = parts_silence_nan(p, status);
2146     p.frac >>= float16_params.frac_shift;
2147     return float16_pack_raw(p);
2148 }
2149 
2150 float32 float32_silence_nan(float32 a, float_status *status)
2151 {
2152     FloatParts p = float32_unpack_raw(a);
2153     p.frac <<= float32_params.frac_shift;
2154     p = parts_silence_nan(p, status);
2155     p.frac >>= float32_params.frac_shift;
2156     return float32_pack_raw(p);
2157 }
2158 
2159 float64 float64_silence_nan(float64 a, float_status *status)
2160 {
2161     FloatParts p = float64_unpack_raw(a);
2162     p.frac <<= float64_params.frac_shift;
2163     p = parts_silence_nan(p, status);
2164     p.frac >>= float64_params.frac_shift;
2165     return float64_pack_raw(p);
2166 }
2167 
2168 /*----------------------------------------------------------------------------
2169 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
2170 | and 7, and returns the properly rounded 32-bit integer corresponding to the
2171 | input.  If `zSign' is 1, the input is negated before being converted to an
2172 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
2173 | is simply rounded to an integer, with the inexact exception raised if the
2174 | input cannot be represented exactly as an integer.  However, if the fixed-
2175 | point input is too large, the invalid exception is raised and the largest
2176 | positive or negative integer is returned.
2177 *----------------------------------------------------------------------------*/
2178 
2179 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
2180 {
2181     int8_t roundingMode;
2182     flag roundNearestEven;
2183     int8_t roundIncrement, roundBits;
2184     int32_t z;
2185 
2186     roundingMode = status->float_rounding_mode;
2187     roundNearestEven = ( roundingMode == float_round_nearest_even );
2188     switch (roundingMode) {
2189     case float_round_nearest_even:
2190     case float_round_ties_away:
2191         roundIncrement = 0x40;
2192         break;
2193     case float_round_to_zero:
2194         roundIncrement = 0;
2195         break;
2196     case float_round_up:
2197         roundIncrement = zSign ? 0 : 0x7f;
2198         break;
2199     case float_round_down:
2200         roundIncrement = zSign ? 0x7f : 0;
2201         break;
2202     default:
2203         abort();
2204     }
2205     roundBits = absZ & 0x7F;
2206     absZ = ( absZ + roundIncrement )>>7;
2207     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
2208     z = absZ;
2209     if ( zSign ) z = - z;
2210     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
2211         float_raise(float_flag_invalid, status);
2212         return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
2213     }
2214     if (roundBits) {
2215         status->float_exception_flags |= float_flag_inexact;
2216     }
2217     return z;
2218 
2219 }
2220 
2221 /*----------------------------------------------------------------------------
2222 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2223 | `absZ1', with binary point between bits 63 and 64 (between the input words),
2224 | and returns the properly rounded 64-bit integer corresponding to the input.
2225 | If `zSign' is 1, the input is negated before being converted to an integer.
2226 | Ordinarily, the fixed-point input is simply rounded to an integer, with
2227 | the inexact exception raised if the input cannot be represented exactly as
2228 | an integer.  However, if the fixed-point input is too large, the invalid
2229 | exception is raised and the largest positive or negative integer is
2230 | returned.
2231 *----------------------------------------------------------------------------*/
2232 
2233 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
2234                                float_status *status)
2235 {
2236     int8_t roundingMode;
2237     flag roundNearestEven, increment;
2238     int64_t z;
2239 
2240     roundingMode = status->float_rounding_mode;
2241     roundNearestEven = ( roundingMode == float_round_nearest_even );
2242     switch (roundingMode) {
2243     case float_round_nearest_even:
2244     case float_round_ties_away:
2245         increment = ((int64_t) absZ1 < 0);
2246         break;
2247     case float_round_to_zero:
2248         increment = 0;
2249         break;
2250     case float_round_up:
2251         increment = !zSign && absZ1;
2252         break;
2253     case float_round_down:
2254         increment = zSign && absZ1;
2255         break;
2256     default:
2257         abort();
2258     }
2259     if ( increment ) {
2260         ++absZ0;
2261         if ( absZ0 == 0 ) goto overflow;
2262         absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
2263     }
2264     z = absZ0;
2265     if ( zSign ) z = - z;
2266     if ( z && ( ( z < 0 ) ^ zSign ) ) {
2267  overflow:
2268         float_raise(float_flag_invalid, status);
2269         return
2270               zSign ? (int64_t) LIT64( 0x8000000000000000 )
2271             : LIT64( 0x7FFFFFFFFFFFFFFF );
2272     }
2273     if (absZ1) {
2274         status->float_exception_flags |= float_flag_inexact;
2275     }
2276     return z;
2277 
2278 }
2279 
2280 /*----------------------------------------------------------------------------
2281 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2282 | `absZ1', with binary point between bits 63 and 64 (between the input words),
2283 | and returns the properly rounded 64-bit unsigned integer corresponding to the
2284 | input.  Ordinarily, the fixed-point input is simply rounded to an integer,
2285 | with the inexact exception raised if the input cannot be represented exactly
2286 | as an integer.  However, if the fixed-point input is too large, the invalid
2287 | exception is raised and the largest unsigned integer is returned.
2288 *----------------------------------------------------------------------------*/
2289 
2290 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
2291                                 uint64_t absZ1, float_status *status)
2292 {
2293     int8_t roundingMode;
2294     flag roundNearestEven, increment;
2295 
2296     roundingMode = status->float_rounding_mode;
2297     roundNearestEven = (roundingMode == float_round_nearest_even);
2298     switch (roundingMode) {
2299     case float_round_nearest_even:
2300     case float_round_ties_away:
2301         increment = ((int64_t)absZ1 < 0);
2302         break;
2303     case float_round_to_zero:
2304         increment = 0;
2305         break;
2306     case float_round_up:
2307         increment = !zSign && absZ1;
2308         break;
2309     case float_round_down:
2310         increment = zSign && absZ1;
2311         break;
2312     default:
2313         abort();
2314     }
2315     if (increment) {
2316         ++absZ0;
2317         if (absZ0 == 0) {
2318             float_raise(float_flag_invalid, status);
2319             return LIT64(0xFFFFFFFFFFFFFFFF);
2320         }
2321         absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
2322     }
2323 
2324     if (zSign && absZ0) {
2325         float_raise(float_flag_invalid, status);
2326         return 0;
2327     }
2328 
2329     if (absZ1) {
2330         status->float_exception_flags |= float_flag_inexact;
2331     }
2332     return absZ0;
2333 }
2334 
2335 /*----------------------------------------------------------------------------
2336 | If `a' is denormal and we are in flush-to-zero mode then set the
2337 | input-denormal exception and return zero. Otherwise just return the value.
2338 *----------------------------------------------------------------------------*/
2339 float32 float32_squash_input_denormal(float32 a, float_status *status)
2340 {
2341     if (status->flush_inputs_to_zero) {
2342         if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
2343             float_raise(float_flag_input_denormal, status);
2344             return make_float32(float32_val(a) & 0x80000000);
2345         }
2346     }
2347     return a;
2348 }
2349 
/*----------------------------------------------------------------------------
| Normalizes the subnormal single-precision value whose denormalized
| significand is `aSig'.  The significand is shifted left by its number of
| leading zeros minus 8; the shifted significand is stored through `zSigPtr'
| and the matching exponent (1 minus the shift) through `zExpPtr'.
*----------------------------------------------------------------------------*/

static void
 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
{
    int8_t shift = countLeadingZeros32(aSig) - 8;

    *zSigPtr = aSig << shift;
    *zExpPtr = 1 - shift;
}
2367 
2368 /*----------------------------------------------------------------------------
2369 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2370 | and significand `zSig', and returns the proper single-precision floating-
2371 | point value corresponding to the abstract input.  Ordinarily, the abstract
2372 | value is simply rounded and packed into the single-precision format, with
2373 | the inexact exception raised if the abstract input cannot be represented
2374 | exactly.  However, if the abstract value is too large, the overflow and
2375 | inexact exceptions are raised and an infinity or maximal finite value is
2376 | returned.  If the abstract value is too small, the input value is rounded to
2377 | a subnormal number, and the underflow and inexact exceptions are raised if
2378 | the abstract input cannot be represented exactly as a subnormal single-
2379 | precision floating-point number.
2380 |     The input significand `zSig' has its binary point between bits 30
2381 | and 29, which is 7 bits to the left of the usual location.  This shifted
2382 | significand must be normalized or smaller.  If `zSig' is not normalized,
2383 | `zExp' must be 0; in that case, the result returned is a subnormal number,
2384 | and it must not require rounding.  In the usual case that `zSig' is
2385 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2386 | The handling of underflow and overflow follows the IEC/IEEE Standard for
2387 | Binary Floating-Point Arithmetic.
2388 *----------------------------------------------------------------------------*/
2389 
2390 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
2391                                    float_status *status)
2392 {
2393     int8_t roundingMode;
2394     flag roundNearestEven;
2395     int8_t roundIncrement, roundBits;
2396     flag isTiny;
2397 
2398     roundingMode = status->float_rounding_mode;
2399     roundNearestEven = ( roundingMode == float_round_nearest_even );
2400     switch (roundingMode) {
2401     case float_round_nearest_even:
2402     case float_round_ties_away:
2403         roundIncrement = 0x40;
2404         break;
2405     case float_round_to_zero:
2406         roundIncrement = 0;
2407         break;
2408     case float_round_up:
2409         roundIncrement = zSign ? 0 : 0x7f;
2410         break;
2411     case float_round_down:
2412         roundIncrement = zSign ? 0x7f : 0;
2413         break;
2414     default:
2415         abort();
2416         break;
2417     }
2418     roundBits = zSig & 0x7F;
2419     if ( 0xFD <= (uint16_t) zExp ) {
2420         if (    ( 0xFD < zExp )
2421              || (    ( zExp == 0xFD )
2422                   && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
2423            ) {
2424             float_raise(float_flag_overflow | float_flag_inexact, status);
2425             return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
2426         }
2427         if ( zExp < 0 ) {
2428             if (status->flush_to_zero) {
2429                 float_raise(float_flag_output_denormal, status);
2430                 return packFloat32(zSign, 0, 0);
2431             }
2432             isTiny =
2433                 (status->float_detect_tininess
2434                  == float_tininess_before_rounding)
2435                 || ( zExp < -1 )
2436                 || ( zSig + roundIncrement < 0x80000000 );
2437             shift32RightJamming( zSig, - zExp, &zSig );
2438             zExp = 0;
2439             roundBits = zSig & 0x7F;
2440             if (isTiny && roundBits) {
2441                 float_raise(float_flag_underflow, status);
2442             }
2443         }
2444     }
2445     if (roundBits) {
2446         status->float_exception_flags |= float_flag_inexact;
2447     }
2448     zSig = ( zSig + roundIncrement )>>7;
2449     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
2450     if ( zSig == 0 ) zExp = 0;
2451     return packFloat32( zSign, zExp, zSig );
2452 
2453 }
2454 
2455 /*----------------------------------------------------------------------------
2456 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2457 | and significand `zSig', and returns the proper single-precision floating-
2458 | point value corresponding to the abstract input.  This routine is just like
2459 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
2460 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2461 | floating-point exponent.
2462 *----------------------------------------------------------------------------*/
2463 
2464 static float32
2465  normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
2466                               float_status *status)
2467 {
2468     int8_t shiftCount;
2469 
2470     shiftCount = countLeadingZeros32( zSig ) - 1;
2471     return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
2472                                status);
2473 
2474 }
2475 
2476 /*----------------------------------------------------------------------------
2477 | If `a' is denormal and we are in flush-to-zero mode then set the
2478 | input-denormal exception and return zero. Otherwise just return the value.
2479 *----------------------------------------------------------------------------*/
2480 float64 float64_squash_input_denormal(float64 a, float_status *status)
2481 {
2482     if (status->flush_inputs_to_zero) {
2483         if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
2484             float_raise(float_flag_input_denormal, status);
2485             return make_float64(float64_val(a) & (1ULL << 63));
2486         }
2487     }
2488     return a;
2489 }
2490 
/*----------------------------------------------------------------------------
| Normalizes the subnormal double-precision value whose denormalized
| significand is `aSig'.  The significand is shifted left by its number of
| leading zeros minus 11; the shifted significand is stored through `zSigPtr'
| and the matching exponent (1 minus the shift) through `zExpPtr'.
*----------------------------------------------------------------------------*/

static void
 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
{
    int8_t shift = countLeadingZeros64(aSig) - 11;

    *zSigPtr = aSig << shift;
    *zExpPtr = 1 - shift;
}
2508 
2509 /*----------------------------------------------------------------------------
2510 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2511 | double-precision floating-point value, returning the result.  After being
2512 | shifted into the proper positions, the three fields are simply added
2513 | together to form the result.  This means that any integer portion of `zSig'
2514 | will be added into the exponent.  Since a properly normalized significand
2515 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2516 | than the desired result exponent whenever `zSig' is a complete, normalized
2517 | significand.
2518 *----------------------------------------------------------------------------*/
2519 
2520 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig)
2521 {
2522 
2523     return make_float64(
2524         ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
2525 
2526 }
2527 
2528 /*----------------------------------------------------------------------------
2529 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2530 | and significand `zSig', and returns the proper double-precision floating-
2531 | point value corresponding to the abstract input.  Ordinarily, the abstract
2532 | value is simply rounded and packed into the double-precision format, with
2533 | the inexact exception raised if the abstract input cannot be represented
2534 | exactly.  However, if the abstract value is too large, the overflow and
2535 | inexact exceptions are raised and an infinity or maximal finite value is
2536 | returned.  If the abstract value is too small, the input value is rounded to
2537 | a subnormal number, and the underflow and inexact exceptions are raised if
2538 | the abstract input cannot be represented exactly as a subnormal double-
2539 | precision floating-point number.
2540 |     The input significand `zSig' has its binary point between bits 62
2541 | and 61, which is 10 bits to the left of the usual location.  This shifted
2542 | significand must be normalized or smaller.  If `zSig' is not normalized,
2543 | `zExp' must be 0; in that case, the result returned is a subnormal number,
2544 | and it must not require rounding.  In the usual case that `zSig' is
2545 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2546 | The handling of underflow and overflow follows the IEC/IEEE Standard for
2547 | Binary Floating-Point Arithmetic.
2548 *----------------------------------------------------------------------------*/
2549 
/*
 * Rounds the abstract value (zSign, zExp, zSig) according to
 * status->float_rounding_mode and packs it with packFloat64().  The low
 * 10 bits of zSig (mask 0x3FF) are the part that is rounded away; the
 * packed significand is (zSig + roundIncrement) >> 10.
 */
2550 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
2551                                    float_status *status)
2552 {
2553     int8_t roundingMode;
2554     flag roundNearestEven;
2555     int roundIncrement, roundBits;
2556     flag isTiny;
2557 
2558     roundingMode = status->float_rounding_mode;
2559     roundNearestEven = ( roundingMode == float_round_nearest_even );
         /*
          * Choose the value added before the low 10 bits are dropped:
          * 0x200 (half of the discarded range) for the nearest modes,
          * 0x3ff to round away from zero whenever a discarded bit is set,
          * 0 to truncate.  Unknown modes abort().
          */
2560     switch (roundingMode) {
2561     case float_round_nearest_even:
2562     case float_round_ties_away:
2563         roundIncrement = 0x200;
2564         break;
2565     case float_round_to_zero:
2566         roundIncrement = 0;
2567         break;
2568     case float_round_up:
2569         roundIncrement = zSign ? 0 : 0x3ff;
2570         break;
2571     case float_round_down:
2572         roundIncrement = zSign ? 0x3ff : 0;
2573         break;
2574     case float_round_to_odd:
             /* 0x400 is the lowest kept bit: only round up when it is clear. */
2575         roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
2576         break;
2577     default:
2578         abort();
2579     }
2580     roundBits = zSig & 0x3FF;
         /*
          * Slow path for exponents at or above 0x7FD and for negative
          * exponents (a negative zExp converts to a large uint16_t value).
          * NOTE(review): assumes zExp stays within 16-bit range so the cast
          * cannot alias an out-of-range exponent onto a small value - confirm.
          */
2581     if ( 0x7FD <= (uint16_t) zExp ) {
             /* Overflow: exponent too large, or rounding carries into bit 63. */
2582         if (    ( 0x7FD < zExp )
2583              || (    ( zExp == 0x7FD )
2584                   && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
2585            ) {
2586             bool overflow_to_inf = roundingMode != float_round_to_odd &&
2587                                    roundIncrement != 0;
2588             float_raise(float_flag_overflow | float_flag_inexact, status);
                 /*
                  * Significand argument is 0 for infinity, or all ones
                  * otherwise; since packFloat64() adds its fields, all ones
                  * turns exponent 0x7FF into 0x7FE with an all-ones fraction.
                  */
2589             return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
2590         }
2591         if ( zExp < 0 ) {
2592             if (status->flush_to_zero) {
2593                 float_raise(float_flag_output_denormal, status);
2594                 return packFloat64(zSign, 0, 0);
2595             }
                 /*
                  * Tiny when tininess is detected before rounding, when
                  * zExp is below -1, or when adding the increment does not
                  * reach bit 63.
                  */
2596             isTiny =
2597                    (status->float_detect_tininess
2598                     == float_tininess_before_rounding)
2599                 || ( zExp < -1 )
2600                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
                 /* Denormalize: shift right by -zExp (helper defined elsewhere). */
2601             shift64RightJamming( zSig, - zExp, &zSig );
2602             zExp = 0;
2603             roundBits = zSig & 0x3FF;
                 /* Underflow is only raised when the tiny result is inexact. */
2604             if (isTiny && roundBits) {
2605                 float_raise(float_flag_underflow, status);
2606             }
2607             if (roundingMode == float_round_to_odd) {
2608                 /*
2609                  * For round-to-odd case, the roundIncrement depends on
2610                  * zSig which just changed.
2611                  */
2612                 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
2613             }
2614         }
2615     }
2616     if (roundBits) {
2617         status->float_exception_flags |= float_flag_inexact;
2618     }
2619     zSig = ( zSig + roundIncrement )>>10;
         /* Exact tie (roundBits == 0x200) in nearest-even: clear bit 0. */
2620     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
         /* A zero significand is packed with a zero exponent field. */
2621     if ( zSig == 0 ) zExp = 0;
2622     return packFloat64( zSign, zExp, zSig );
2623 
2624 }
2625 
2626 /*----------------------------------------------------------------------------
2627 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2628 | and significand `zSig', and returns the proper double-precision floating-
2629 | point value corresponding to the abstract input.  This routine is just like
2630 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
2631 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2632 | floating-point exponent.
2633 *----------------------------------------------------------------------------*/
2634 
2635 static float64
2636  normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
2637                               float_status *status)
2638 {
2639     int8_t shiftCount;
2640 
2641     shiftCount = countLeadingZeros64( zSig ) - 1;
2642     return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
2643                                status);
2644 
2645 }
2646 
/*----------------------------------------------------------------------------
| Normalizes a subnormal extended double-precision significand.  `aSig' is
| shifted left by its leading-zero count and stored through `zSigPtr'; the
| matching exponent, 1 minus that count, is stored through `zExpPtr'.
*----------------------------------------------------------------------------*/

void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
                                uint64_t *zSigPtr)
{
    /* Removes every leading zero (leading 1 ends up in bit 63). */
    int8_t lshift = countLeadingZeros64(aSig);

    *zSigPtr = aSig << lshift;
    *zExpPtr = 1 - lshift;
}
2663 
2664 /*----------------------------------------------------------------------------
2665 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2666 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
2667 | and returns the proper extended double-precision floating-point value
2668 | corresponding to the abstract input.  Ordinarily, the abstract value is
2669 | rounded and packed into the extended double-precision format, with the
2670 | inexact exception raised if the abstract input cannot be represented
2671 | exactly.  However, if the abstract value is too large, the overflow and
2672 | inexact exceptions are raised and an infinity or maximal finite value is
2673 | returned.  If the abstract value is too small, the input value is rounded to
2674 | a subnormal number, and the underflow and inexact exceptions are raised if
2675 | the abstract input cannot be represented exactly as a subnormal extended
2676 | double-precision floating-point number.
2677 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
2678 | number of bits as single or double precision, respectively.  Otherwise, the
2679 | result is rounded to the full precision of the extended double-precision
2680 | format.
2681 |     The input significand must be normalized or smaller.  If the input
2682 | significand is not normalized, `zExp' must be 0; in that case, the result
2683 | returned is a subnormal number, and it must not require rounding.  The
2684 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
2685 | Floating-Point Arithmetic.
2686 *----------------------------------------------------------------------------*/
2687 
/*
 * Rounds (zSign, zExp, zSig0:zSig1) to `roundingPrecision' bits and packs
 * the result with packFloatx80().  Two code paths exist: a reduced-precision
 * path (roundingPrecision 64 or 32) that rounds inside zSig0 using
 * roundMask/roundIncrement, and the full-precision path at label
 * `precision80' that rounds on zSig1.  Both share the tail at label
 * `overflow'.  Rounding modes other than the four/five listed in the
 * switches abort().
 */
2688 floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
2689                               int32_t zExp, uint64_t zSig0, uint64_t zSig1,
2690                               float_status *status)
2691 {
2692     int8_t roundingMode;
2693     flag roundNearestEven, increment, isTiny;
2694     int64_t roundIncrement, roundMask, roundBits;
2695 
2696     roundingMode = status->float_rounding_mode;
2697     roundNearestEven = ( roundingMode == float_round_nearest_even );
         /* 80, and any value other than 64 or 32, selects full precision. */
2698     if ( roundingPrecision == 80 ) goto precision80;
2699     if ( roundingPrecision == 64 ) {
             /* Low 11 bits of zSig0 are discarded; 0x400 is half that range. */
2700         roundIncrement = LIT64( 0x0000000000000400 );
2701         roundMask = LIT64( 0x00000000000007FF );
2702     }
2703     else if ( roundingPrecision == 32 ) {
             /* Low 40 bits of zSig0 are discarded; increment is half that range. */
2704         roundIncrement = LIT64( 0x0000008000000000 );
2705         roundMask = LIT64( 0x000000FFFFFFFFFF );
2706     }
2707     else {
2708         goto precision80;
2709     }
         /* Fold any non-zero zSig1 into bit 0 of zSig0 as a sticky bit. */
2710     zSig0 |= ( zSig1 != 0 );
         /* Nearest modes keep the half increment chosen above. */
2711     switch (roundingMode) {
2712     case float_round_nearest_even:
2713     case float_round_ties_away:
2714         break;
2715     case float_round_to_zero:
2716         roundIncrement = 0;
2717         break;
2718     case float_round_up:
2719         roundIncrement = zSign ? 0 : roundMask;
2720         break;
2721     case float_round_down:
2722         roundIncrement = zSign ? roundMask : 0;
2723         break;
2724     default:
2725         abort();
2726     }
2727     roundBits = zSig0 & roundMask;
         /*
          * True for zExp >= 0x7FFE and, through unsigned wrap-around of
          * zExp - 1, for zExp <= 0.
          */
2728     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
             /* Overflow: exponent too large, or rounding carries out of bit 63. */
2729         if (    ( 0x7FFE < zExp )
2730              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
2731            ) {
2732             goto overflow;
2733         }
2734         if ( zExp <= 0 ) {
2735             if (status->flush_to_zero) {
2736                 float_raise(float_flag_output_denormal, status);
2737                 return packFloatx80(zSign, 0, 0);
2738             }
                 /*
                  * Tiny when tininess is detected before rounding, when
                  * zExp is negative, or when adding the increment does not
                  * wrap past bit 63.
                  */
2739             isTiny =
2740                    (status->float_detect_tininess
2741                     == float_tininess_before_rounding)
2742                 || ( zExp < 0 )
2743                 || ( zSig0 <= zSig0 + roundIncrement );
2744             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
2745             zExp = 0;
2746             roundBits = zSig0 & roundMask;
2747             if (isTiny && roundBits) {
2748                 float_raise(float_flag_underflow, status);
2749             }
2750             if (roundBits) {
2751                 status->float_exception_flags |= float_flag_inexact;
2752             }
2753             zSig0 += roundIncrement;
                 /* Rounding reached bit 63: the result uses exponent 1. */
2754             if ( (int64_t) zSig0 < 0 ) zExp = 1;
                 /*
                  * roundMask + 1 is the lowest kept bit.  On an exact tie in
                  * nearest-even mode that bit is added to the mask so that
                  * it is cleared as well.
                  */
2755             roundIncrement = roundMask + 1;
2756             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
2757                 roundMask |= roundIncrement;
2758             }
2759             zSig0 &= ~ roundMask;
2760             return packFloatx80( zSign, zExp, zSig0 );
2761         }
2762     }
2763     if (roundBits) {
2764         status->float_exception_flags |= float_flag_inexact;
2765     }
2766     zSig0 += roundIncrement;
         /* The addition wrapped: bump the exponent and restart at 1.0. */
2767     if ( zSig0 < roundIncrement ) {
2768         ++zExp;
2769         zSig0 = LIT64( 0x8000000000000000 );
2770     }
         /* Same tie-to-even masking as in the subnormal branch above. */
2771     roundIncrement = roundMask + 1;
2772     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
2773         roundMask |= roundIncrement;
2774     }
2775     zSig0 &= ~ roundMask;
2776     if ( zSig0 == 0 ) zExp = 0;
2777     return packFloatx80( zSign, zExp, zSig0 );
         /*
          * Full-precision path: zSig0 is kept whole and zSig1 holds the bits
          * rounded away.  `increment' says whether zSig0 gets +1.
          */
2778  precision80:
2779     switch (roundingMode) {
2780     case float_round_nearest_even:
2781     case float_round_ties_away:
             /* Top bit of zSig1 set: the discarded part is at least one half. */
2782         increment = ((int64_t)zSig1 < 0);
2783         break;
2784     case float_round_to_zero:
2785         increment = 0;
2786         break;
2787     case float_round_up:
2788         increment = !zSign && zSig1;
2789         break;
2790     case float_round_down:
2791         increment = zSign && zSig1;
2792         break;
2793     default:
2794         abort();
2795     }
2796     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
2797         if (    ( 0x7FFE < zExp )
2798              || (    ( zExp == 0x7FFE )
2799                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
2800                   && increment
2801                 )
2802            ) {
                 /* Makes ~roundMask all ones for the shared tail below. */
2803             roundMask = 0;
                 /*
                  * Shared overflow tail; the reduced-precision path jumps
                  * here with its own roundMask still set.  Modes that round
                  * toward zero for this sign return exponent 0x7FFE with
                  * significand ~roundMask; all others return infinity.
                  */
2804  overflow:
2805             float_raise(float_flag_overflow | float_flag_inexact, status);
2806             if (    ( roundingMode == float_round_to_zero )
2807                  || ( zSign && ( roundingMode == float_round_up ) )
2808                  || ( ! zSign && ( roundingMode == float_round_down ) )
2809                ) {
2810                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
2811             }
2812             return packFloatx80(zSign,
2813                                 floatx80_infinity_high,
2814                                 floatx80_infinity_low);
2815         }
             /*
              * NOTE(review): unlike the reduced-precision branch above, this
              * branch does not consult status->flush_to_zero - confirm that
              * this is intentional.
              */
2816         if ( zExp <= 0 ) {
2817             isTiny =
2818                    (status->float_detect_tininess
2819                     == float_tininess_before_rounding)
2820                 || ( zExp < 0 )
2821                 || ! increment
2822                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
2823             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
2824             zExp = 0;
2825             if (isTiny && zSig1) {
2826                 float_raise(float_flag_underflow, status);
2827             }
2828             if (zSig1) {
2829                 status->float_exception_flags |= float_flag_inexact;
2830             }
                 /* zSig1 changed in the shift, so `increment' is recomputed. */
2831             switch (roundingMode) {
2832             case float_round_nearest_even:
2833             case float_round_ties_away:
2834                 increment = ((int64_t)zSig1 < 0);
2835                 break;
2836             case float_round_to_zero:
2837                 increment = 0;
2838                 break;
2839             case float_round_up:
2840                 increment = !zSign && zSig1;
2841                 break;
2842             case float_round_down:
2843                 increment = zSign && zSig1;
2844                 break;
2845             default:
2846                 abort();
2847             }
2848             if ( increment ) {
2849                 ++zSig0;
                     /* zSig1<<1 == 0 here means an exact tie: clear bit 0. */
2850                 zSig0 &=
2851                     ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
2852                 if ( (int64_t) zSig0 < 0 ) zExp = 1;
2853             }
2854             return packFloatx80( zSign, zExp, zSig0 );
2855         }
2856     }
2857     if (zSig1) {
2858         status->float_exception_flags |= float_flag_inexact;
2859     }
2860     if ( increment ) {
2861         ++zSig0;
             /* Wrapped to zero: bump the exponent and restart at 1.0. */
2862         if ( zSig0 == 0 ) {
2863             ++zExp;
2864             zSig0 = LIT64( 0x8000000000000000 );
2865         }
2866         else {
2867             zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
2868         }
2869     }
2870     else {
2871         if ( zSig0 == 0 ) zExp = 0;
2872     }
2873     return packFloatx80( zSign, zExp, zSig0 );
2874 
2875 }
2876 
2877 /*----------------------------------------------------------------------------
2878 | Takes an abstract floating-point value having sign `zSign', exponent
2879 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
2880 | and returns the proper extended double-precision floating-point value
2881 | corresponding to the abstract input.  This routine is just like
2882 | `roundAndPackFloatx80' except that the input significand does not have to be
2883 | normalized.
2884 *----------------------------------------------------------------------------*/
2885 
2886 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
2887                                        flag zSign, int32_t zExp,
2888                                        uint64_t zSig0, uint64_t zSig1,
2889                                        float_status *status)
2890 {
2891     int8_t shiftCount;
2892 
2893     if ( zSig0 == 0 ) {
2894         zSig0 = zSig1;
2895         zSig1 = 0;
2896         zExp -= 64;
2897     }
2898     shiftCount = countLeadingZeros64( zSig0 );
2899     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
2900     zExp -= shiftCount;
2901     return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
2902                                 zSig0, zSig1, status);
2903 
2904 }
2905 
2906 /*----------------------------------------------------------------------------
2907 | Returns the least-significant 64 fraction bits of the quadruple-precision
2908 | floating-point value `a'.
2909 *----------------------------------------------------------------------------*/
2910 
2911 static inline uint64_t extractFloat128Frac1( float128 a )
2912 {
2913 
2914     return a.low;
2915 
2916 }
2917 
2918 /*----------------------------------------------------------------------------
2919 | Returns the most-significant 48 fraction bits of the quadruple-precision
2920 | floating-point value `a'.
2921 *----------------------------------------------------------------------------*/
2922 
2923 static inline uint64_t extractFloat128Frac0( float128 a )
2924 {
2925 
2926     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
2927 
2928 }
2929 
2930 /*----------------------------------------------------------------------------
2931 | Returns the exponent bits of the quadruple-precision floating-point value
2932 | `a'.
2933 *----------------------------------------------------------------------------*/
2934 
2935 static inline int32_t extractFloat128Exp( float128 a )
2936 {
2937 
2938     return ( a.high>>48 ) & 0x7FFF;
2939 
2940 }
2941 
2942 /*----------------------------------------------------------------------------
2943 | Returns the sign bit of the quadruple-precision floating-point value `a'.
2944 *----------------------------------------------------------------------------*/
2945 
2946 static inline flag extractFloat128Sign( float128 a )
2947 {
2948 
2949     return a.high>>63;
2950 
2951 }
2952 
/*----------------------------------------------------------------------------
| Normalizes a subnormal quadruple-precision significand given as the
| concatenation of `aSig0' (high) and `aSig1' (low).  The normalized exponent
| is stored through `zExpPtr'; the most significant 49 bits of the normalized
| significand are stored through `zSig0Ptr' and the least significant 64 bits
| through `zSig1Ptr'.
*----------------------------------------------------------------------------*/

static void
 normalizeFloat128Subnormal(
     uint64_t aSig0,
     uint64_t aSig1,
     int32_t *zExpPtr,
     uint64_t *zSig0Ptr,
     uint64_t *zSig1Ptr
 )
{
    int8_t lshift;

    if (aSig0 != 0) {
        /* Leading 1 is in the high word: one short 128-bit left shift. */
        lshift = countLeadingZeros64(aSig0) - 15;
        shortShift128Left(aSig0, aSig1, lshift, zSig0Ptr, zSig1Ptr);
        *zExpPtr = 1 - lshift;
        return;
    }

    /*
     * High word is empty: the low word is moved up by a whole word and then
     * adjusted by `lshift', which may be negative (a right shift).
     */
    lshift = countLeadingZeros64(aSig1) - 15;
    if (lshift >= 0) {
        *zSig0Ptr = aSig1 << lshift;
        *zSig1Ptr = 0;
    } else {
        *zSig0Ptr = aSig1 >> (-lshift);
        /* Bits shifted out of the high word land at the top of the low word. */
        *zSig1Ptr = aSig1 << (lshift & 63);
    }
    *zExpPtr = -lshift - 63;
}
2993 
2994 /*----------------------------------------------------------------------------
2995 | Packs the sign `zSign', the exponent `zExp', and the significand formed
2996 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
2997 | floating-point value, returning the result.  After being shifted into the
2998 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
2999 | added together to form the most significant 32 bits of the result.  This
3000 | means that any integer portion of `zSig0' will be added into the exponent.
3001 | Since a properly normalized significand will have an integer portion equal
3002 | to 1, the `zExp' input should be 1 less than the desired result exponent
3003 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
3004 | significand.
3005 *----------------------------------------------------------------------------*/
3006 
3007 static inline float128
3008  packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
3009 {
3010     float128 z;
3011 
3012     z.low = zSig1;
3013     z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
3014     return z;
3015 
3016 }
3017 
3018 /*----------------------------------------------------------------------------
3019 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3020 | and extended significand formed by the concatenation of `zSig0', `zSig1',
3021 | and `zSig2', and returns the proper quadruple-precision floating-point value
3022 | corresponding to the abstract input.  Ordinarily, the abstract value is
3023 | simply rounded and packed into the quadruple-precision format, with the
3024 | inexact exception raised if the abstract input cannot be represented
3025 | exactly.  However, if the abstract value is too large, the overflow and
3026 | inexact exceptions are raised and an infinity or maximal finite value is
3027 | returned.  If the abstract value is too small, the input value is rounded to
3028 | a subnormal number, and the underflow and inexact exceptions are raised if
3029 | the abstract input cannot be represented exactly as a subnormal quadruple-
3030 | precision floating-point number.
3031 |     The input significand must be normalized or smaller.  If the input
3032 | significand is not normalized, `zExp' must be 0; in that case, the result
3033 | returned is a subnormal number, and it must not require rounding.  In the
3034 | usual case that the input significand is normalized, `zExp' must be 1 less
3035 | than the ``true'' floating-point exponent.  The handling of underflow and
3036 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3037 *----------------------------------------------------------------------------*/
3038 
/*
 * Rounds (zSign, zExp, zSig0:zSig1:zSig2) according to
 * status->float_rounding_mode and packs the result with packFloat128().
 * zSig0:zSig1 is the significand that is kept; zSig2 holds the bits rounded
 * away, and `increment' says whether zSig0:zSig1 gets +1.
 */
3039 static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
3040                                      uint64_t zSig0, uint64_t zSig1,
3041                                      uint64_t zSig2, float_status *status)
3042 {
3043     int8_t roundingMode;
3044     flag roundNearestEven, increment, isTiny;
3045 
3046     roundingMode = status->float_rounding_mode;
3047     roundNearestEven = ( roundingMode == float_round_nearest_even );
3048     switch (roundingMode) {
3049     case float_round_nearest_even:
3050     case float_round_ties_away:
             /* Top bit of zSig2 set: the discarded part is at least one half. */
3051         increment = ((int64_t)zSig2 < 0);
3052         break;
3053     case float_round_to_zero:
3054         increment = 0;
3055         break;
3056     case float_round_up:
3057         increment = !zSign && zSig2;
3058         break;
3059     case float_round_down:
3060         increment = zSign && zSig2;
3061         break;
3062     case float_round_to_odd:
             /* Inexact with an even kept LSB: +1 makes the LSB odd. */
3063         increment = !(zSig1 & 0x1) && zSig2;
3064         break;
3065     default:
3066         abort();
3067     }
         /* True for zExp >= 0x7FFD and, via the unsigned cast, for zExp < 0. */
3068     if ( 0x7FFD <= (uint32_t) zExp ) {
             /*
              * Overflow: exponent too large, or exponent 0x7FFD with the
              * largest significand value and a pending increment.
              */
3069         if (    ( 0x7FFD < zExp )
3070              || (    ( zExp == 0x7FFD )
3071                   && eq128(
3072                          LIT64( 0x0001FFFFFFFFFFFF ),
3073                          LIT64( 0xFFFFFFFFFFFFFFFF ),
3074                          zSig0,
3075                          zSig1
3076                      )
3077                   && increment
3078                 )
3079            ) {
3080             float_raise(float_flag_overflow | float_flag_inexact, status);
                 /*
                  * These modes return exponent 0x7FFE with an all-ones
                  * fraction instead of infinity.
                  */
3081             if (    ( roundingMode == float_round_to_zero )
3082                  || ( zSign && ( roundingMode == float_round_up ) )
3083                  || ( ! zSign && ( roundingMode == float_round_down ) )
3084                  || (roundingMode == float_round_to_odd)
3085                ) {
3086                 return
3087                     packFloat128(
3088                         zSign,
3089                         0x7FFE,
3090                         LIT64( 0x0000FFFFFFFFFFFF ),
3091                         LIT64( 0xFFFFFFFFFFFFFFFF )
3092                     );
3093             }
3094             return packFloat128( zSign, 0x7FFF, 0, 0 );
3095         }
3096         if ( zExp < 0 ) {
3097             if (status->flush_to_zero) {
3098                 float_raise(float_flag_output_denormal, status);
3099                 return packFloat128(zSign, 0, 0, 0);
3100             }
                 /*
                  * Tiny when tininess is detected before rounding, when
                  * zExp is below -1, when no increment is pending, or when
                  * the significand is below its largest value.
                  */
3101             isTiny =
3102                    (status->float_detect_tininess
3103                     == float_tininess_before_rounding)
3104                 || ( zExp < -1 )
3105                 || ! increment
3106                 || lt128(
3107                        zSig0,
3108                        zSig1,
3109                        LIT64( 0x0001FFFFFFFFFFFF ),
3110                        LIT64( 0xFFFFFFFFFFFFFFFF )
3111                    );
3112             shift128ExtraRightJamming(
3113                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
3114             zExp = 0;
3115             if (isTiny && zSig2) {
3116                 float_raise(float_flag_underflow, status);
3117             }
                 /* The shift changed zSig1/zSig2, so `increment' is recomputed. */
3118             switch (roundingMode) {
3119             case float_round_nearest_even:
3120             case float_round_ties_away:
3121                 increment = ((int64_t)zSig2 < 0);
3122                 break;
3123             case float_round_to_zero:
3124                 increment = 0;
3125                 break;
3126             case float_round_up:
3127                 increment = !zSign && zSig2;
3128                 break;
3129             case float_round_down:
3130                 increment = zSign && zSig2;
3131                 break;
3132             case float_round_to_odd:
3133                 increment = !(zSig1 & 0x1) && zSig2;
3134                 break;
3135             default:
3136                 abort();
3137             }
3138         }
3139     }
3140     if (zSig2) {
3141         status->float_exception_flags |= float_flag_inexact;
3142     }
3143     if ( increment ) {
3144         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
             /* zSig2 + zSig2 == 0 in nearest-even mode: clear the LSB (tie). */
3145         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
3146     }
3147     else {
             /* A zero significand is packed with a zero exponent field. */
3148         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
3149     }
3150     return packFloat128( zSign, zExp, zSig0, zSig1 );
3151 
3152 }
3153 
3154 /*----------------------------------------------------------------------------
3155 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3156 | and significand formed by the concatenation of `zSig0' and `zSig1', and
3157 | returns the proper quadruple-precision floating-point value corresponding
3158 | to the abstract input.  This routine is just like `roundAndPackFloat128'
3159 | except that the input significand has fewer bits and does not have to be
3160 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
3161 | point exponent.
3162 *----------------------------------------------------------------------------*/
3163 
3164 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
3165                                               uint64_t zSig0, uint64_t zSig1,
3166                                               float_status *status)
3167 {
3168     int8_t shiftCount;
3169     uint64_t zSig2;
3170 
3171     if ( zSig0 == 0 ) {
3172         zSig0 = zSig1;
3173         zSig1 = 0;
3174         zExp -= 64;
3175     }
3176     shiftCount = countLeadingZeros64( zSig0 ) - 15;
3177     if ( 0 <= shiftCount ) {
3178         zSig2 = 0;
3179         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3180     }
3181     else {
3182         shift128ExtraRightJamming(
3183             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
3184     }
3185     zExp -= shiftCount;
3186     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
3187 
3188 }
3189 
3190 
3191 /*----------------------------------------------------------------------------
3192 | Returns the result of converting the 32-bit two's complement integer `a'
3193 | to the extended double-precision floating-point format.  The conversion
3194 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3195 | Arithmetic.
3196 *----------------------------------------------------------------------------*/
3197 
3198 floatx80 int32_to_floatx80(int32_t a, float_status *status)
3199 {
3200     flag zSign;
3201     uint32_t absA;
3202     int8_t shiftCount;
3203     uint64_t zSig;
3204 
3205     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
3206     zSign = ( a < 0 );
3207     absA = zSign ? - a : a;
3208     shiftCount = countLeadingZeros32( absA ) + 32;
3209     zSig = absA;
3210     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
3211 
3212 }
3213 
3214 /*----------------------------------------------------------------------------
3215 | Returns the result of converting the 32-bit two's complement integer `a' to
3216 | the quadruple-precision floating-point format.  The conversion is performed
3217 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3218 *----------------------------------------------------------------------------*/
3219 
3220 float128 int32_to_float128(int32_t a, float_status *status)
3221 {
3222     flag zSign;
3223     uint32_t absA;
3224     int8_t shiftCount;
3225     uint64_t zSig0;
3226 
3227     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
3228     zSign = ( a < 0 );
3229     absA = zSign ? - a : a;
3230     shiftCount = countLeadingZeros32( absA ) + 17;
3231     zSig0 = absA;
3232     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
3233 
3234 }
3235 
3236 /*----------------------------------------------------------------------------
3237 | Returns the result of converting the 64-bit two's complement integer `a'
3238 | to the extended double-precision floating-point format.  The conversion
3239 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3240 | Arithmetic.
3241 *----------------------------------------------------------------------------*/
3242 
3243 floatx80 int64_to_floatx80(int64_t a, float_status *status)
3244 {
3245     flag zSign;
3246     uint64_t absA;
3247     int8_t shiftCount;
3248 
3249     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
3250     zSign = ( a < 0 );
3251     absA = zSign ? - a : a;
3252     shiftCount = countLeadingZeros64( absA );
3253     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
3254 
3255 }
3256 
3257 /*----------------------------------------------------------------------------
3258 | Returns the result of converting the 64-bit two's complement integer `a' to
3259 | the quadruple-precision floating-point format.  The conversion is performed
3260 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3261 *----------------------------------------------------------------------------*/
3262 
3263 float128 int64_to_float128(int64_t a, float_status *status)
3264 {
3265     flag zSign;
3266     uint64_t absA;
3267     int8_t shiftCount;
3268     int32_t zExp;
3269     uint64_t zSig0, zSig1;
3270 
3271     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
3272     zSign = ( a < 0 );
3273     absA = zSign ? - a : a;
3274     shiftCount = countLeadingZeros64( absA ) + 49;
3275     zExp = 0x406E - shiftCount;
3276     if ( 64 <= shiftCount ) {
3277         zSig1 = 0;
3278         zSig0 = absA;
3279         shiftCount -= 64;
3280     }
3281     else {
3282         zSig1 = absA;
3283         zSig0 = 0;
3284     }
3285     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3286     return packFloat128( zSign, zExp, zSig0, zSig1 );
3287 
3288 }
3289 
3290 /*----------------------------------------------------------------------------
3291 | Returns the result of converting the 64-bit unsigned integer `a'
3292 | to the quadruple-precision floating-point format.  The conversion is performed
3293 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3294 *----------------------------------------------------------------------------*/
3295 
3296 float128 uint64_to_float128(uint64_t a, float_status *status)
3297 {
3298     if (a == 0) {
3299         return float128_zero;
3300     }
3301     return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status);
3302 }
3303 
3304 /*----------------------------------------------------------------------------
3305 | Returns the result of converting the single-precision floating-point value
3306 | `a' to the extended double-precision floating-point format.  The conversion
3307 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3308 | Arithmetic.
3309 *----------------------------------------------------------------------------*/
3310 
3311 floatx80 float32_to_floatx80(float32 a, float_status *status)
3312 {
3313     flag aSign;
3314     int aExp;
3315     uint32_t aSig;
3316 
3317     a = float32_squash_input_denormal(a, status);
3318     aSig = extractFloat32Frac( a );
3319     aExp = extractFloat32Exp( a );
3320     aSign = extractFloat32Sign( a );
3321     if ( aExp == 0xFF ) {
3322         if (aSig) {
3323             return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
3324         }
3325         return packFloatx80(aSign,
3326                             floatx80_infinity_high,
3327                             floatx80_infinity_low);
3328     }
3329     if ( aExp == 0 ) {
3330         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3331         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3332     }
3333     aSig |= 0x00800000;
3334     return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
3335 
3336 }
3337 
/*----------------------------------------------------------------------------
| Returns the result of converting the single-precision floating-point value
| `a' to the quadruple-precision floating-point format.  The conversion is
| performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------*/
3344 
3345 float128 float32_to_float128(float32 a, float_status *status)
3346 {
3347     flag aSign;
3348     int aExp;
3349     uint32_t aSig;
3350 
3351     a = float32_squash_input_denormal(a, status);
3352     aSig = extractFloat32Frac( a );
3353     aExp = extractFloat32Exp( a );
3354     aSign = extractFloat32Sign( a );
3355     if ( aExp == 0xFF ) {
3356         if (aSig) {
3357             return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
3358         }
3359         return packFloat128( aSign, 0x7FFF, 0, 0 );
3360     }
3361     if ( aExp == 0 ) {
3362         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3363         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3364         --aExp;
3365     }
3366     return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
3367 
3368 }
3369 
/*----------------------------------------------------------------------------
| Returns the remainder of the single-precision floating-point value `a'
| with respect to the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
|
| Special cases handled before the arithmetic:
|   - either operand NaN: the NaN is propagated;
|   - `a' infinite, or `b' zero: invalid exception, default NaN returned;
|   - `b' infinite, or `a' zero: `a' is returned unchanged.
| The final step picks between two candidate remainders (the last
| non-negative one and the first negative one), choosing the one of smaller
| magnitude and, on a tie, the one whose quotient is even.
*----------------------------------------------------------------------------*/

float32 float32_rem(float32 a, float32 b, float_status *status)
{
    flag aSign, zSign;
    int aExp, bExp, expDiff;
    uint32_t aSig, bSig;
    uint32_t q;
    uint64_t aSig64, bSig64, q64;
    uint32_t alternateASig;
    int32_t sigMean;
    a = float32_squash_input_denormal(a, status);
    b = float32_squash_input_denormal(b, status);

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    bSig = extractFloat32Frac( b );
    bExp = extractFloat32Exp( b );
    /* `a' is infinity or NaN. */
    if ( aExp == 0xFF ) {
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
            return propagateFloat32NaN(a, b, status);
        }
        /* Remainder of infinity is invalid. */
        float_raise(float_flag_invalid, status);
        return float32_default_nan(status);
    }
    /* `b' is infinity or NaN (and `a' is finite). */
    if ( bExp == 0xFF ) {
        if (bSig) {
            return propagateFloat32NaN(a, b, status);
        }
        return a;
    }
    if ( bExp == 0 ) {
        /* Remainder with respect to zero is invalid. */
        if ( bSig == 0 ) {
            float_raise(float_flag_invalid, status);
            return float32_default_nan(status);
        }
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return a;
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    expDiff = aExp - bExp;
    /* Make the leading one explicit in both significands. */
    aSig |= 0x00800000;
    bSig |= 0x00800000;
    if ( expDiff < 32 ) {
        /* Small exponent difference: a single 64-by-32-bit division is
         * enough to obtain the quotient bits. */
        aSig <<= 8;
        bSig <<= 8;
        if ( expDiff < 0 ) {
            /* aExp <= bExp - 2, so |a| < |b|/2 and `a' is already the
             * remainder. */
            if ( expDiff < -1 ) return a;
            aSig >>= 1;
        }
        q = ( bSig <= aSig );
        if ( q ) aSig -= bSig;
        if ( 0 < expDiff ) {
            q = ( ( (uint64_t) aSig )<<32 ) / bSig;
            q >>= 32 - expDiff;
            bSig >>= 2;
            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
        }
        else {
            aSig >>= 2;
            bSig >>= 2;
        }
    }
    else {
        /* Large exponent difference: reduce 62 exponent steps per loop
         * iteration using estimated 64-bit quotients. */
        if ( bSig <= aSig ) aSig -= bSig;
        aSig64 = ( (uint64_t) aSig )<<40;
        bSig64 = ( (uint64_t) bSig )<<40;
        expDiff -= 64;
        while ( 0 < expDiff ) {
            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
            /* Lower the estimate by 2, clamped at 0 (presumably so that it
             * never exceeds the true quotient -- confirm against
             * estimateDiv128To64). */
            q64 = ( 2 < q64 ) ? q64 - 2 : 0;
            aSig64 = - ( ( bSig * q64 )<<38 );
            expDiff -= 62;
        }
        expDiff += 64;
        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
        q = q64>>( 64 - expDiff );
        bSig <<= 6;
        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
    }
    /* Keep subtracting bSig until the partial remainder turns negative.
     * alternateASig is the last non-negative value; aSig the first
     * negative one; q counts the subtractions. */
    do {
        alternateASig = aSig;
        ++q;
        aSig -= bSig;
    } while ( 0 <= (int32_t) aSig );
    /* The sign of the sum tells which candidate has the smaller magnitude;
     * a zero sum is a tie, resolved towards the even quotient. */
    sigMean = aSig + alternateASig;
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
        aSig = alternateASig;
    }
    /* A negative candidate flips the sign of the result relative to `a'. */
    zSign = ( (int32_t) aSig < 0 );
    if ( zSign ) aSig = - aSig;
    return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
}
3471 
3472 
3473 
3474 /*----------------------------------------------------------------------------
3475 | Returns the binary exponential of the single-precision floating-point value
3476 | `a'. The operation is performed according to the IEC/IEEE Standard for
3477 | Binary Floating-Point Arithmetic.
3478 |
3479 | Uses the following identities:
3480 |
3481 | 1. -------------------------------------------------------------------------
3482 |      x    x*ln(2)
3483 |     2  = e
3484 |
3485 | 2. -------------------------------------------------------------------------
3486 |                      2     3     4     5           n
3487 |      x        x     x     x     x     x           x
3488 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
3489 |               1!    2!    3!    4!    5!          n!
3490 *----------------------------------------------------------------------------*/
3491 
/* Coefficients of the exponential power series, as double-precision bit
 * patterns: entry i (0-based) holds 1/(i+1)!, i.e. the factor applied to
 * x^(i+1).  The trailing comment on each line is n = i+1. */
static const float64 float32_exp2_coefficients[15] =
{
    const_float64( 0x3ff0000000000000ll ), /*  1 */
    const_float64( 0x3fe0000000000000ll ), /*  2 */
    const_float64( 0x3fc5555555555555ll ), /*  3 */
    const_float64( 0x3fa5555555555555ll ), /*  4 */
    const_float64( 0x3f81111111111111ll ), /*  5 */
    const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
    const_float64( 0x3f2a01a01a01a01all ), /*  7 */
    const_float64( 0x3efa01a01a01a01all ), /*  8 */
    const_float64( 0x3ec71de3a556c734ll ), /*  9 */
    const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
    const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
    const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
    const_float64( 0x3de6124613a86d09ll ), /* 13 */
    const_float64( 0x3da93974a8c07c9dll ), /* 14 */
    const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
};
3510 
3511 float32 float32_exp2(float32 a, float_status *status)
3512 {
3513     flag aSign;
3514     int aExp;
3515     uint32_t aSig;
3516     float64 r, x, xn;
3517     int i;
3518     a = float32_squash_input_denormal(a, status);
3519 
3520     aSig = extractFloat32Frac( a );
3521     aExp = extractFloat32Exp( a );
3522     aSign = extractFloat32Sign( a );
3523 
3524     if ( aExp == 0xFF) {
3525         if (aSig) {
3526             return propagateFloat32NaN(a, float32_zero, status);
3527         }
3528         return (aSign) ? float32_zero : a;
3529     }
3530     if (aExp == 0) {
3531         if (aSig == 0) return float32_one;
3532     }
3533 
3534     float_raise(float_flag_inexact, status);
3535 
3536     /* ******************************* */
3537     /* using float64 for approximation */
3538     /* ******************************* */
3539     x = float32_to_float64(a, status);
3540     x = float64_mul(x, float64_ln2, status);
3541 
3542     xn = x;
3543     r = float64_one;
3544     for (i = 0 ; i < 15 ; i++) {
3545         float64 f;
3546 
3547         f = float64_mul(xn, float32_exp2_coefficients[i], status);
3548         r = float64_add(r, f, status);
3549 
3550         xn = float64_mul(xn, x, status);
3551     }
3552 
3553     return float64_to_float32(r, status);
3554 }
3555 
3556 /*----------------------------------------------------------------------------
3557 | Returns the binary log of the single-precision floating-point value `a'.
3558 | The operation is performed according to the IEC/IEEE Standard for Binary
3559 | Floating-Point Arithmetic.
3560 *----------------------------------------------------------------------------*/
3561 float32 float32_log2(float32 a, float_status *status)
3562 {
3563     flag aSign, zSign;
3564     int aExp;
3565     uint32_t aSig, zSig, i;
3566 
3567     a = float32_squash_input_denormal(a, status);
3568     aSig = extractFloat32Frac( a );
3569     aExp = extractFloat32Exp( a );
3570     aSign = extractFloat32Sign( a );
3571 
3572     if ( aExp == 0 ) {
3573         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
3574         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3575     }
3576     if ( aSign ) {
3577         float_raise(float_flag_invalid, status);
3578         return float32_default_nan(status);
3579     }
3580     if ( aExp == 0xFF ) {
3581         if (aSig) {
3582             return propagateFloat32NaN(a, float32_zero, status);
3583         }
3584         return a;
3585     }
3586 
3587     aExp -= 0x7F;
3588     aSig |= 0x00800000;
3589     zSign = aExp < 0;
3590     zSig = aExp << 23;
3591 
3592     for (i = 1 << 22; i > 0; i >>= 1) {
3593         aSig = ( (uint64_t)aSig * aSig ) >> 23;
3594         if ( aSig & 0x01000000 ) {
3595             aSig >>= 1;
3596             zSig |= i;
3597         }
3598     }
3599 
3600     if ( zSign )
3601         zSig = -zSig;
3602 
3603     return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
3604 }
3605 
3606 /*----------------------------------------------------------------------------
3607 | Returns 1 if the single-precision floating-point value `a' is equal to
3608 | the corresponding value `b', and 0 otherwise.  The invalid exception is
3609 | raised if either operand is a NaN.  Otherwise, the comparison is performed
3610 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3611 *----------------------------------------------------------------------------*/
3612 
3613 int float32_eq(float32 a, float32 b, float_status *status)
3614 {
3615     uint32_t av, bv;
3616     a = float32_squash_input_denormal(a, status);
3617     b = float32_squash_input_denormal(b, status);
3618 
3619     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3620          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3621        ) {
3622         float_raise(float_flag_invalid, status);
3623         return 0;
3624     }
3625     av = float32_val(a);
3626     bv = float32_val(b);
3627     return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3628 }
3629 
3630 /*----------------------------------------------------------------------------
3631 | Returns 1 if the single-precision floating-point value `a' is less than
3632 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
3633 | exception is raised if either operand is a NaN.  The comparison is performed
3634 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3635 *----------------------------------------------------------------------------*/
3636 
3637 int float32_le(float32 a, float32 b, float_status *status)
3638 {
3639     flag aSign, bSign;
3640     uint32_t av, bv;
3641     a = float32_squash_input_denormal(a, status);
3642     b = float32_squash_input_denormal(b, status);
3643 
3644     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3645          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3646        ) {
3647         float_raise(float_flag_invalid, status);
3648         return 0;
3649     }
3650     aSign = extractFloat32Sign( a );
3651     bSign = extractFloat32Sign( b );
3652     av = float32_val(a);
3653     bv = float32_val(b);
3654     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3655     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3656 
3657 }
3658 
3659 /*----------------------------------------------------------------------------
3660 | Returns 1 if the single-precision floating-point value `a' is less than
3661 | the corresponding value `b', and 0 otherwise.  The invalid exception is
3662 | raised if either operand is a NaN.  The comparison is performed according
3663 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3664 *----------------------------------------------------------------------------*/
3665 
3666 int float32_lt(float32 a, float32 b, float_status *status)
3667 {
3668     flag aSign, bSign;
3669     uint32_t av, bv;
3670     a = float32_squash_input_denormal(a, status);
3671     b = float32_squash_input_denormal(b, status);
3672 
3673     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3674          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3675        ) {
3676         float_raise(float_flag_invalid, status);
3677         return 0;
3678     }
3679     aSign = extractFloat32Sign( a );
3680     bSign = extractFloat32Sign( b );
3681     av = float32_val(a);
3682     bv = float32_val(b);
3683     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3684     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3685 
3686 }
3687 
3688 /*----------------------------------------------------------------------------
3689 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3690 | be compared, and 0 otherwise.  The invalid exception is raised if either
3691 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
3692 | Standard for Binary Floating-Point Arithmetic.
3693 *----------------------------------------------------------------------------*/
3694 
3695 int float32_unordered(float32 a, float32 b, float_status *status)
3696 {
3697     a = float32_squash_input_denormal(a, status);
3698     b = float32_squash_input_denormal(b, status);
3699 
3700     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3701          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3702        ) {
3703         float_raise(float_flag_invalid, status);
3704         return 1;
3705     }
3706     return 0;
3707 }
3708 
3709 /*----------------------------------------------------------------------------
3710 | Returns 1 if the single-precision floating-point value `a' is equal to
3711 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3712 | exception.  The comparison is performed according to the IEC/IEEE Standard
3713 | for Binary Floating-Point Arithmetic.
3714 *----------------------------------------------------------------------------*/
3715 
3716 int float32_eq_quiet(float32 a, float32 b, float_status *status)
3717 {
3718     a = float32_squash_input_denormal(a, status);
3719     b = float32_squash_input_denormal(b, status);
3720 
3721     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3722          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3723        ) {
3724         if (float32_is_signaling_nan(a, status)
3725          || float32_is_signaling_nan(b, status)) {
3726             float_raise(float_flag_invalid, status);
3727         }
3728         return 0;
3729     }
3730     return ( float32_val(a) == float32_val(b) ) ||
3731             ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
3732 }
3733 
3734 /*----------------------------------------------------------------------------
3735 | Returns 1 if the single-precision floating-point value `a' is less than or
3736 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3737 | cause an exception.  Otherwise, the comparison is performed according to the
3738 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3739 *----------------------------------------------------------------------------*/
3740 
3741 int float32_le_quiet(float32 a, float32 b, float_status *status)
3742 {
3743     flag aSign, bSign;
3744     uint32_t av, bv;
3745     a = float32_squash_input_denormal(a, status);
3746     b = float32_squash_input_denormal(b, status);
3747 
3748     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3749          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3750        ) {
3751         if (float32_is_signaling_nan(a, status)
3752          || float32_is_signaling_nan(b, status)) {
3753             float_raise(float_flag_invalid, status);
3754         }
3755         return 0;
3756     }
3757     aSign = extractFloat32Sign( a );
3758     bSign = extractFloat32Sign( b );
3759     av = float32_val(a);
3760     bv = float32_val(b);
3761     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3762     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3763 
3764 }
3765 
3766 /*----------------------------------------------------------------------------
3767 | Returns 1 if the single-precision floating-point value `a' is less than
3768 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3769 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3770 | Standard for Binary Floating-Point Arithmetic.
3771 *----------------------------------------------------------------------------*/
3772 
3773 int float32_lt_quiet(float32 a, float32 b, float_status *status)
3774 {
3775     flag aSign, bSign;
3776     uint32_t av, bv;
3777     a = float32_squash_input_denormal(a, status);
3778     b = float32_squash_input_denormal(b, status);
3779 
3780     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3781          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3782        ) {
3783         if (float32_is_signaling_nan(a, status)
3784          || float32_is_signaling_nan(b, status)) {
3785             float_raise(float_flag_invalid, status);
3786         }
3787         return 0;
3788     }
3789     aSign = extractFloat32Sign( a );
3790     bSign = extractFloat32Sign( b );
3791     av = float32_val(a);
3792     bv = float32_val(b);
3793     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3794     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3795 
3796 }
3797 
3798 /*----------------------------------------------------------------------------
3799 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3800 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
3801 | comparison is performed according to the IEC/IEEE Standard for Binary
3802 | Floating-Point Arithmetic.
3803 *----------------------------------------------------------------------------*/
3804 
3805 int float32_unordered_quiet(float32 a, float32 b, float_status *status)
3806 {
3807     a = float32_squash_input_denormal(a, status);
3808     b = float32_squash_input_denormal(b, status);
3809 
3810     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3811          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3812        ) {
3813         if (float32_is_signaling_nan(a, status)
3814          || float32_is_signaling_nan(b, status)) {
3815             float_raise(float_flag_invalid, status);
3816         }
3817         return 1;
3818     }
3819     return 0;
3820 }
3821 
3822 /*----------------------------------------------------------------------------
3823 | If `a' is denormal and we are in flush-to-zero mode then set the
3824 | input-denormal exception and return zero. Otherwise just return the value.
3825 *----------------------------------------------------------------------------*/
3826 float16 float16_squash_input_denormal(float16 a, float_status *status)
3827 {
3828     if (status->flush_inputs_to_zero) {
3829         if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) {
3830             float_raise(float_flag_input_denormal, status);
3831             return make_float16(float16_val(a) & 0x8000);
3832         }
3833     }
3834     return a;
3835 }
3836 
3837 /*----------------------------------------------------------------------------
3838 | Returns the result of converting the double-precision floating-point value
3839 | `a' to the extended double-precision floating-point format.  The conversion
3840 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3841 | Arithmetic.
3842 *----------------------------------------------------------------------------*/
3843 
3844 floatx80 float64_to_floatx80(float64 a, float_status *status)
3845 {
3846     flag aSign;
3847     int aExp;
3848     uint64_t aSig;
3849 
3850     a = float64_squash_input_denormal(a, status);
3851     aSig = extractFloat64Frac( a );
3852     aExp = extractFloat64Exp( a );
3853     aSign = extractFloat64Sign( a );
3854     if ( aExp == 0x7FF ) {
3855         if (aSig) {
3856             return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
3857         }
3858         return packFloatx80(aSign,
3859                             floatx80_infinity_high,
3860                             floatx80_infinity_low);
3861     }
3862     if ( aExp == 0 ) {
3863         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3864         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3865     }
3866     return
3867         packFloatx80(
3868             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
3869 
3870 }
3871 
3872 /*----------------------------------------------------------------------------
3873 | Returns the result of converting the double-precision floating-point value
3874 | `a' to the quadruple-precision floating-point format.  The conversion is
3875 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3876 | Arithmetic.
3877 *----------------------------------------------------------------------------*/
3878 
3879 float128 float64_to_float128(float64 a, float_status *status)
3880 {
3881     flag aSign;
3882     int aExp;
3883     uint64_t aSig, zSig0, zSig1;
3884 
3885     a = float64_squash_input_denormal(a, status);
3886     aSig = extractFloat64Frac( a );
3887     aExp = extractFloat64Exp( a );
3888     aSign = extractFloat64Sign( a );
3889     if ( aExp == 0x7FF ) {
3890         if (aSig) {
3891             return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
3892         }
3893         return packFloat128( aSign, 0x7FFF, 0, 0 );
3894     }
3895     if ( aExp == 0 ) {
3896         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3897         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3898         --aExp;
3899     }
3900     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
3901     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
3902 
3903 }
3904 
3905 
/*----------------------------------------------------------------------------
| Returns the remainder of the double-precision floating-point value `a'
| with respect to the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
|
| Special cases handled before the arithmetic:
|   - either operand NaN: the NaN is propagated;
|   - `a' infinite, or `b' zero: invalid exception, default NaN returned;
|   - `b' infinite, or `a' zero: `a' is returned unchanged.
| The final step picks between two candidate remainders (the last
| non-negative one and the first negative one), choosing the one of smaller
| magnitude and, on a tie, the one whose quotient is even.
*----------------------------------------------------------------------------*/

float64 float64_rem(float64 a, float64 b, float_status *status)
{
    flag aSign, zSign;
    int aExp, bExp, expDiff;
    uint64_t aSig, bSig;
    uint64_t q, alternateASig;
    int64_t sigMean;

    a = float64_squash_input_denormal(a, status);
    b = float64_squash_input_denormal(b, status);
    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );
    bSig = extractFloat64Frac( b );
    bExp = extractFloat64Exp( b );
    /* `a' is infinity or NaN. */
    if ( aExp == 0x7FF ) {
        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
            return propagateFloat64NaN(a, b, status);
        }
        /* Remainder of infinity is invalid. */
        float_raise(float_flag_invalid, status);
        return float64_default_nan(status);
    }
    /* `b' is infinity or NaN (and `a' is finite). */
    if ( bExp == 0x7FF ) {
        if (bSig) {
            return propagateFloat64NaN(a, b, status);
        }
        return a;
    }
    if ( bExp == 0 ) {
        /* Remainder with respect to zero is invalid. */
        if ( bSig == 0 ) {
            float_raise(float_flag_invalid, status);
            return float64_default_nan(status);
        }
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return a;
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
    }
    expDiff = aExp - bExp;
    /* Make the leading one explicit and left-justify both significands
     * (leading one in bit 63). */
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
    if ( expDiff < 0 ) {
        /* aExp <= bExp - 2, so |a| < |b|/2 and `a' is already the
         * remainder. */
        if ( expDiff < -1 ) return a;
        aSig >>= 1;
    }
    q = ( bSig <= aSig );
    if ( q ) aSig -= bSig;
    /* Reduce 62 exponent steps per iteration while more than 64 remain,
     * using estimated 64-bit quotients. */
    expDiff -= 64;
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig, 0, bSig );
        /* Lower the estimate by 2, clamped at 0 (presumably so that it
         * never exceeds the true quotient -- confirm against
         * estimateDiv128To64). */
        q = ( 2 < q ) ? q - 2 : 0;
        aSig = - ( ( bSig>>2 ) * q );
        expDiff -= 62;
    }
    expDiff += 64;
    if ( 0 < expDiff ) {
        /* Final partial step covering the remaining expDiff bits. */
        q = estimateDiv128To64( aSig, 0, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        q >>= 64 - expDiff;
        bSig >>= 2;
        aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
    }
    else {
        aSig >>= 2;
        bSig >>= 2;
    }
    /* Keep subtracting bSig until the partial remainder turns negative.
     * alternateASig is the last non-negative value; aSig the first
     * negative one; q counts the subtractions. */
    do {
        alternateASig = aSig;
        ++q;
        aSig -= bSig;
    } while ( 0 <= (int64_t) aSig );
    /* The sign of the sum tells which candidate has the smaller magnitude;
     * a zero sum is a tie, resolved towards the even quotient. */
    sigMean = aSig + alternateASig;
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
        aSig = alternateASig;
    }
    /* A negative candidate flips the sign of the result relative to `a'. */
    zSign = ( (int64_t) aSig < 0 );
    if ( zSign ) aSig = - aSig;
    return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);

}
3993 
3994 /*----------------------------------------------------------------------------
3995 | Returns the binary log of the double-precision floating-point value `a'.
3996 | The operation is performed according to the IEC/IEEE Standard for Binary
3997 | Floating-Point Arithmetic.
3998 *----------------------------------------------------------------------------*/
3999 float64 float64_log2(float64 a, float_status *status)
4000 {
4001     flag aSign, zSign;
4002     int aExp;
4003     uint64_t aSig, aSig0, aSig1, zSig, i;
4004     a = float64_squash_input_denormal(a, status);
4005 
4006     aSig = extractFloat64Frac( a );
4007     aExp = extractFloat64Exp( a );
4008     aSign = extractFloat64Sign( a );
4009 
4010     if ( aExp == 0 ) {
4011         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4012         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4013     }
4014     if ( aSign ) {
4015         float_raise(float_flag_invalid, status);
4016         return float64_default_nan(status);
4017     }
4018     if ( aExp == 0x7FF ) {
4019         if (aSig) {
4020             return propagateFloat64NaN(a, float64_zero, status);
4021         }
4022         return a;
4023     }
4024 
4025     aExp -= 0x3FF;
4026     aSig |= LIT64( 0x0010000000000000 );
4027     zSign = aExp < 0;
4028     zSig = (uint64_t)aExp << 52;
4029     for (i = 1LL << 51; i > 0; i >>= 1) {
4030         mul64To128( aSig, aSig, &aSig0, &aSig1 );
4031         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4032         if ( aSig & LIT64( 0x0020000000000000 ) ) {
4033             aSig >>= 1;
4034             zSig |= i;
4035         }
4036     }
4037 
4038     if ( zSign )
4039         zSig = -zSig;
4040     return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
4041 }
4042 
4043 /*----------------------------------------------------------------------------
4044 | Returns 1 if the double-precision floating-point value `a' is equal to the
4045 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
4046 | if either operand is a NaN.  Otherwise, the comparison is performed
4047 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4048 *----------------------------------------------------------------------------*/
4049 
4050 int float64_eq(float64 a, float64 b, float_status *status)
4051 {
4052     uint64_t av, bv;
4053     a = float64_squash_input_denormal(a, status);
4054     b = float64_squash_input_denormal(b, status);
4055 
4056     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4057          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4058        ) {
4059         float_raise(float_flag_invalid, status);
4060         return 0;
4061     }
4062     av = float64_val(a);
4063     bv = float64_val(b);
4064     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4065 
4066 }
4067 
4068 /*----------------------------------------------------------------------------
4069 | Returns 1 if the double-precision floating-point value `a' is less than or
4070 | equal to the corresponding value `b', and 0 otherwise.  The invalid
4071 | exception is raised if either operand is a NaN.  The comparison is performed
4072 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4073 *----------------------------------------------------------------------------*/
4074 
4075 int float64_le(float64 a, float64 b, float_status *status)
4076 {
4077     flag aSign, bSign;
4078     uint64_t av, bv;
4079     a = float64_squash_input_denormal(a, status);
4080     b = float64_squash_input_denormal(b, status);
4081 
4082     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4083          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4084        ) {
4085         float_raise(float_flag_invalid, status);
4086         return 0;
4087     }
4088     aSign = extractFloat64Sign( a );
4089     bSign = extractFloat64Sign( b );
4090     av = float64_val(a);
4091     bv = float64_val(b);
4092     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4093     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4094 
4095 }
4096 
4097 /*----------------------------------------------------------------------------
4098 | Returns 1 if the double-precision floating-point value `a' is less than
4099 | the corresponding value `b', and 0 otherwise.  The invalid exception is
4100 | raised if either operand is a NaN.  The comparison is performed according
4101 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4102 *----------------------------------------------------------------------------*/
4103 
4104 int float64_lt(float64 a, float64 b, float_status *status)
4105 {
4106     flag aSign, bSign;
4107     uint64_t av, bv;
4108 
4109     a = float64_squash_input_denormal(a, status);
4110     b = float64_squash_input_denormal(b, status);
4111     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4112          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4113        ) {
4114         float_raise(float_flag_invalid, status);
4115         return 0;
4116     }
4117     aSign = extractFloat64Sign( a );
4118     bSign = extractFloat64Sign( b );
4119     av = float64_val(a);
4120     bv = float64_val(b);
4121     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4122     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4123 
4124 }
4125 
4126 /*----------------------------------------------------------------------------
4127 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4128 | be compared, and 0 otherwise.  The invalid exception is raised if either
4129 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
4130 | Standard for Binary Floating-Point Arithmetic.
4131 *----------------------------------------------------------------------------*/
4132 
4133 int float64_unordered(float64 a, float64 b, float_status *status)
4134 {
4135     a = float64_squash_input_denormal(a, status);
4136     b = float64_squash_input_denormal(b, status);
4137 
4138     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4139          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4140        ) {
4141         float_raise(float_flag_invalid, status);
4142         return 1;
4143     }
4144     return 0;
4145 }
4146 
4147 /*----------------------------------------------------------------------------
4148 | Returns 1 if the double-precision floating-point value `a' is equal to the
4149 | corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4150 | exception.The comparison is performed according to the IEC/IEEE Standard
4151 | for Binary Floating-Point Arithmetic.
4152 *----------------------------------------------------------------------------*/
4153 
4154 int float64_eq_quiet(float64 a, float64 b, float_status *status)
4155 {
4156     uint64_t av, bv;
4157     a = float64_squash_input_denormal(a, status);
4158     b = float64_squash_input_denormal(b, status);
4159 
4160     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4161          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4162        ) {
4163         if (float64_is_signaling_nan(a, status)
4164          || float64_is_signaling_nan(b, status)) {
4165             float_raise(float_flag_invalid, status);
4166         }
4167         return 0;
4168     }
4169     av = float64_val(a);
4170     bv = float64_val(b);
4171     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4172 
4173 }
4174 
4175 /*----------------------------------------------------------------------------
4176 | Returns 1 if the double-precision floating-point value `a' is less than or
4177 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4178 | cause an exception.  Otherwise, the comparison is performed according to the
4179 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4180 *----------------------------------------------------------------------------*/
4181 
4182 int float64_le_quiet(float64 a, float64 b, float_status *status)
4183 {
4184     flag aSign, bSign;
4185     uint64_t av, bv;
4186     a = float64_squash_input_denormal(a, status);
4187     b = float64_squash_input_denormal(b, status);
4188 
4189     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4190          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4191        ) {
4192         if (float64_is_signaling_nan(a, status)
4193          || float64_is_signaling_nan(b, status)) {
4194             float_raise(float_flag_invalid, status);
4195         }
4196         return 0;
4197     }
4198     aSign = extractFloat64Sign( a );
4199     bSign = extractFloat64Sign( b );
4200     av = float64_val(a);
4201     bv = float64_val(b);
4202     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4203     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4204 
4205 }
4206 
4207 /*----------------------------------------------------------------------------
4208 | Returns 1 if the double-precision floating-point value `a' is less than
4209 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4210 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4211 | Standard for Binary Floating-Point Arithmetic.
4212 *----------------------------------------------------------------------------*/
4213 
4214 int float64_lt_quiet(float64 a, float64 b, float_status *status)
4215 {
4216     flag aSign, bSign;
4217     uint64_t av, bv;
4218     a = float64_squash_input_denormal(a, status);
4219     b = float64_squash_input_denormal(b, status);
4220 
4221     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4222          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4223        ) {
4224         if (float64_is_signaling_nan(a, status)
4225          || float64_is_signaling_nan(b, status)) {
4226             float_raise(float_flag_invalid, status);
4227         }
4228         return 0;
4229     }
4230     aSign = extractFloat64Sign( a );
4231     bSign = extractFloat64Sign( b );
4232     av = float64_val(a);
4233     bv = float64_val(b);
4234     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4235     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4236 
4237 }
4238 
4239 /*----------------------------------------------------------------------------
4240 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4241 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
4242 | comparison is performed according to the IEC/IEEE Standard for Binary
4243 | Floating-Point Arithmetic.
4244 *----------------------------------------------------------------------------*/
4245 
4246 int float64_unordered_quiet(float64 a, float64 b, float_status *status)
4247 {
4248     a = float64_squash_input_denormal(a, status);
4249     b = float64_squash_input_denormal(b, status);
4250 
4251     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4252          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4253        ) {
4254         if (float64_is_signaling_nan(a, status)
4255          || float64_is_signaling_nan(b, status)) {
4256             float_raise(float_flag_invalid, status);
4257         }
4258         return 1;
4259     }
4260     return 0;
4261 }
4262 
4263 /*----------------------------------------------------------------------------
4264 | Returns the result of converting the extended double-precision floating-
4265 | point value `a' to the 32-bit two's complement integer format.  The
4266 | conversion is performed according to the IEC/IEEE Standard for Binary
4267 | Floating-Point Arithmetic---which means in particular that the conversion
4268 | is rounded according to the current rounding mode.  If `a' is a NaN, the
4269 | largest positive integer is returned.  Otherwise, if the conversion
4270 | overflows, the largest integer with the same sign as `a' is returned.
4271 *----------------------------------------------------------------------------*/
4272 
4273 int32_t floatx80_to_int32(floatx80 a, float_status *status)
4274 {
4275     flag aSign;
4276     int32_t aExp, shiftCount;
4277     uint64_t aSig;
4278 
4279     if (floatx80_invalid_encoding(a)) {
4280         float_raise(float_flag_invalid, status);
4281         return 1 << 31;
4282     }
4283     aSig = extractFloatx80Frac( a );
4284     aExp = extractFloatx80Exp( a );
4285     aSign = extractFloatx80Sign( a );
4286     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4287     shiftCount = 0x4037 - aExp;
4288     if ( shiftCount <= 0 ) shiftCount = 1;
4289     shift64RightJamming( aSig, shiftCount, &aSig );
4290     return roundAndPackInt32(aSign, aSig, status);
4291 
4292 }
4293 
4294 /*----------------------------------------------------------------------------
4295 | Returns the result of converting the extended double-precision floating-
4296 | point value `a' to the 32-bit two's complement integer format.  The
4297 | conversion is performed according to the IEC/IEEE Standard for Binary
4298 | Floating-Point Arithmetic, except that the conversion is always rounded
4299 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4300 | Otherwise, if the conversion overflows, the largest integer with the same
4301 | sign as `a' is returned.
4302 *----------------------------------------------------------------------------*/
4303 
4304 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
4305 {
4306     flag aSign;
4307     int32_t aExp, shiftCount;
4308     uint64_t aSig, savedASig;
4309     int32_t z;
4310 
4311     if (floatx80_invalid_encoding(a)) {
4312         float_raise(float_flag_invalid, status);
4313         return 1 << 31;
4314     }
4315     aSig = extractFloatx80Frac( a );
4316     aExp = extractFloatx80Exp( a );
4317     aSign = extractFloatx80Sign( a );
4318     if ( 0x401E < aExp ) {
4319         if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4320         goto invalid;
4321     }
4322     else if ( aExp < 0x3FFF ) {
4323         if (aExp || aSig) {
4324             status->float_exception_flags |= float_flag_inexact;
4325         }
4326         return 0;
4327     }
4328     shiftCount = 0x403E - aExp;
4329     savedASig = aSig;
4330     aSig >>= shiftCount;
4331     z = aSig;
4332     if ( aSign ) z = - z;
4333     if ( ( z < 0 ) ^ aSign ) {
4334  invalid:
4335         float_raise(float_flag_invalid, status);
4336         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4337     }
4338     if ( ( aSig<<shiftCount ) != savedASig ) {
4339         status->float_exception_flags |= float_flag_inexact;
4340     }
4341     return z;
4342 
4343 }
4344 
4345 /*----------------------------------------------------------------------------
4346 | Returns the result of converting the extended double-precision floating-
4347 | point value `a' to the 64-bit two's complement integer format.  The
4348 | conversion is performed according to the IEC/IEEE Standard for Binary
4349 | Floating-Point Arithmetic---which means in particular that the conversion
4350 | is rounded according to the current rounding mode.  If `a' is a NaN,
4351 | the largest positive integer is returned.  Otherwise, if the conversion
4352 | overflows, the largest integer with the same sign as `a' is returned.
4353 *----------------------------------------------------------------------------*/
4354 
4355 int64_t floatx80_to_int64(floatx80 a, float_status *status)
4356 {
4357     flag aSign;
4358     int32_t aExp, shiftCount;
4359     uint64_t aSig, aSigExtra;
4360 
4361     if (floatx80_invalid_encoding(a)) {
4362         float_raise(float_flag_invalid, status);
4363         return 1ULL << 63;
4364     }
4365     aSig = extractFloatx80Frac( a );
4366     aExp = extractFloatx80Exp( a );
4367     aSign = extractFloatx80Sign( a );
4368     shiftCount = 0x403E - aExp;
4369     if ( shiftCount <= 0 ) {
4370         if ( shiftCount ) {
4371             float_raise(float_flag_invalid, status);
4372             if (!aSign || floatx80_is_any_nan(a)) {
4373                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4374             }
4375             return (int64_t) LIT64( 0x8000000000000000 );
4376         }
4377         aSigExtra = 0;
4378     }
4379     else {
4380         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4381     }
4382     return roundAndPackInt64(aSign, aSig, aSigExtra, status);
4383 
4384 }
4385 
4386 /*----------------------------------------------------------------------------
4387 | Returns the result of converting the extended double-precision floating-
4388 | point value `a' to the 64-bit two's complement integer format.  The
4389 | conversion is performed according to the IEC/IEEE Standard for Binary
4390 | Floating-Point Arithmetic, except that the conversion is always rounded
4391 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4392 | Otherwise, if the conversion overflows, the largest integer with the same
4393 | sign as `a' is returned.
4394 *----------------------------------------------------------------------------*/
4395 
4396 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
4397 {
4398     flag aSign;
4399     int32_t aExp, shiftCount;
4400     uint64_t aSig;
4401     int64_t z;
4402 
4403     if (floatx80_invalid_encoding(a)) {
4404         float_raise(float_flag_invalid, status);
4405         return 1ULL << 63;
4406     }
4407     aSig = extractFloatx80Frac( a );
4408     aExp = extractFloatx80Exp( a );
4409     aSign = extractFloatx80Sign( a );
4410     shiftCount = aExp - 0x403E;
4411     if ( 0 <= shiftCount ) {
4412         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4413         if ( ( a.high != 0xC03E ) || aSig ) {
4414             float_raise(float_flag_invalid, status);
4415             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4416                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4417             }
4418         }
4419         return (int64_t) LIT64( 0x8000000000000000 );
4420     }
4421     else if ( aExp < 0x3FFF ) {
4422         if (aExp | aSig) {
4423             status->float_exception_flags |= float_flag_inexact;
4424         }
4425         return 0;
4426     }
4427     z = aSig>>( - shiftCount );
4428     if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
4429         status->float_exception_flags |= float_flag_inexact;
4430     }
4431     if ( aSign ) z = - z;
4432     return z;
4433 
4434 }
4435 
4436 /*----------------------------------------------------------------------------
4437 | Returns the result of converting the extended double-precision floating-
4438 | point value `a' to the single-precision floating-point format.  The
4439 | conversion is performed according to the IEC/IEEE Standard for Binary
4440 | Floating-Point Arithmetic.
4441 *----------------------------------------------------------------------------*/
4442 
4443 float32 floatx80_to_float32(floatx80 a, float_status *status)
4444 {
4445     flag aSign;
4446     int32_t aExp;
4447     uint64_t aSig;
4448 
4449     if (floatx80_invalid_encoding(a)) {
4450         float_raise(float_flag_invalid, status);
4451         return float32_default_nan(status);
4452     }
4453     aSig = extractFloatx80Frac( a );
4454     aExp = extractFloatx80Exp( a );
4455     aSign = extractFloatx80Sign( a );
4456     if ( aExp == 0x7FFF ) {
4457         if ( (uint64_t) ( aSig<<1 ) ) {
4458             return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
4459         }
4460         return packFloat32( aSign, 0xFF, 0 );
4461     }
4462     shift64RightJamming( aSig, 33, &aSig );
4463     if ( aExp || aSig ) aExp -= 0x3F81;
4464     return roundAndPackFloat32(aSign, aExp, aSig, status);
4465 
4466 }
4467 
4468 /*----------------------------------------------------------------------------
4469 | Returns the result of converting the extended double-precision floating-
4470 | point value `a' to the double-precision floating-point format.  The
4471 | conversion is performed according to the IEC/IEEE Standard for Binary
4472 | Floating-Point Arithmetic.
4473 *----------------------------------------------------------------------------*/
4474 
4475 float64 floatx80_to_float64(floatx80 a, float_status *status)
4476 {
4477     flag aSign;
4478     int32_t aExp;
4479     uint64_t aSig, zSig;
4480 
4481     if (floatx80_invalid_encoding(a)) {
4482         float_raise(float_flag_invalid, status);
4483         return float64_default_nan(status);
4484     }
4485     aSig = extractFloatx80Frac( a );
4486     aExp = extractFloatx80Exp( a );
4487     aSign = extractFloatx80Sign( a );
4488     if ( aExp == 0x7FFF ) {
4489         if ( (uint64_t) ( aSig<<1 ) ) {
4490             return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
4491         }
4492         return packFloat64( aSign, 0x7FF, 0 );
4493     }
4494     shift64RightJamming( aSig, 1, &zSig );
4495     if ( aExp || aSig ) aExp -= 0x3C01;
4496     return roundAndPackFloat64(aSign, aExp, zSig, status);
4497 
4498 }
4499 
4500 /*----------------------------------------------------------------------------
4501 | Returns the result of converting the extended double-precision floating-
4502 | point value `a' to the quadruple-precision floating-point format.  The
4503 | conversion is performed according to the IEC/IEEE Standard for Binary
4504 | Floating-Point Arithmetic.
4505 *----------------------------------------------------------------------------*/
4506 
4507 float128 floatx80_to_float128(floatx80 a, float_status *status)
4508 {
4509     flag aSign;
4510     int aExp;
4511     uint64_t aSig, zSig0, zSig1;
4512 
4513     if (floatx80_invalid_encoding(a)) {
4514         float_raise(float_flag_invalid, status);
4515         return float128_default_nan(status);
4516     }
4517     aSig = extractFloatx80Frac( a );
4518     aExp = extractFloatx80Exp( a );
4519     aSign = extractFloatx80Sign( a );
4520     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
4521         return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
4522     }
4523     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4524     return packFloat128( aSign, aExp, zSig0, zSig1 );
4525 
4526 }
4527 
4528 /*----------------------------------------------------------------------------
4529 | Rounds the extended double-precision floating-point value `a'
4530 | to the precision provided by floatx80_rounding_precision and returns the
4531 | result as an extended double-precision floating-point value.
4532 | The operation is performed according to the IEC/IEEE Standard for Binary
4533 | Floating-Point Arithmetic.
4534 *----------------------------------------------------------------------------*/
4535 
4536 floatx80 floatx80_round(floatx80 a, float_status *status)
4537 {
4538     return roundAndPackFloatx80(status->floatx80_rounding_precision,
4539                                 extractFloatx80Sign(a),
4540                                 extractFloatx80Exp(a),
4541                                 extractFloatx80Frac(a), 0, status);
4542 }
4543 
4544 /*----------------------------------------------------------------------------
4545 | Rounds the extended double-precision floating-point value `a' to an integer,
4546 | and returns the result as an extended quadruple-precision floating-point
4547 | value.  The operation is performed according to the IEC/IEEE Standard for
4548 | Binary Floating-Point Arithmetic.
4549 *----------------------------------------------------------------------------*/
4550 
4551 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
4552 {
4553     flag aSign;
4554     int32_t aExp;
4555     uint64_t lastBitMask, roundBitsMask;
4556     floatx80 z;
4557 
4558     if (floatx80_invalid_encoding(a)) {
4559         float_raise(float_flag_invalid, status);
4560         return floatx80_default_nan(status);
4561     }
4562     aExp = extractFloatx80Exp( a );
4563     if ( 0x403E <= aExp ) {
4564         if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
4565             return propagateFloatx80NaN(a, a, status);
4566         }
4567         return a;
4568     }
4569     if ( aExp < 0x3FFF ) {
4570         if (    ( aExp == 0 )
4571              && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
4572             return a;
4573         }
4574         status->float_exception_flags |= float_flag_inexact;
4575         aSign = extractFloatx80Sign( a );
4576         switch (status->float_rounding_mode) {
4577          case float_round_nearest_even:
4578             if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
4579                ) {
4580                 return
4581                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4582             }
4583             break;
4584         case float_round_ties_away:
4585             if (aExp == 0x3FFE) {
4586                 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
4587             }
4588             break;
4589          case float_round_down:
4590             return
4591                   aSign ?
4592                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4593                 : packFloatx80( 0, 0, 0 );
4594          case float_round_up:
4595             return
4596                   aSign ? packFloatx80( 1, 0, 0 )
4597                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4598         }
4599         return packFloatx80( aSign, 0, 0 );
4600     }
4601     lastBitMask = 1;
4602     lastBitMask <<= 0x403E - aExp;
4603     roundBitsMask = lastBitMask - 1;
4604     z = a;
4605     switch (status->float_rounding_mode) {
4606     case float_round_nearest_even:
4607         z.low += lastBitMask>>1;
4608         if ((z.low & roundBitsMask) == 0) {
4609             z.low &= ~lastBitMask;
4610         }
4611         break;
4612     case float_round_ties_away:
4613         z.low += lastBitMask >> 1;
4614         break;
4615     case float_round_to_zero:
4616         break;
4617     case float_round_up:
4618         if (!extractFloatx80Sign(z)) {
4619             z.low += roundBitsMask;
4620         }
4621         break;
4622     case float_round_down:
4623         if (extractFloatx80Sign(z)) {
4624             z.low += roundBitsMask;
4625         }
4626         break;
4627     default:
4628         abort();
4629     }
4630     z.low &= ~ roundBitsMask;
4631     if ( z.low == 0 ) {
4632         ++z.high;
4633         z.low = LIT64( 0x8000000000000000 );
4634     }
4635     if (z.low != a.low) {
4636         status->float_exception_flags |= float_flag_inexact;
4637     }
4638     return z;
4639 
4640 }
4641 
4642 /*----------------------------------------------------------------------------
4643 | Returns the result of adding the absolute values of the extended double-
4644 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
4645 | negated before being returned.  `zSign' is ignored if the result is a NaN.
4646 | The addition is performed according to the IEC/IEEE Standard for Binary
4647 | Floating-Point Arithmetic.
4648 *----------------------------------------------------------------------------*/
4649 
4650 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
4651                                 float_status *status)
4652 {
4653     int32_t aExp, bExp, zExp;
4654     uint64_t aSig, bSig, zSig0, zSig1;
4655     int32_t expDiff;
4656 
4657     aSig = extractFloatx80Frac( a );
4658     aExp = extractFloatx80Exp( a );
4659     bSig = extractFloatx80Frac( b );
4660     bExp = extractFloatx80Exp( b );
4661     expDiff = aExp - bExp;
4662     if ( 0 < expDiff ) {
4663         if ( aExp == 0x7FFF ) {
4664             if ((uint64_t)(aSig << 1)) {
4665                 return propagateFloatx80NaN(a, b, status);
4666             }
4667             return a;
4668         }
4669         if ( bExp == 0 ) --expDiff;
4670         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4671         zExp = aExp;
4672     }
4673     else if ( expDiff < 0 ) {
4674         if ( bExp == 0x7FFF ) {
4675             if ((uint64_t)(bSig << 1)) {
4676                 return propagateFloatx80NaN(a, b, status);
4677             }
4678             return packFloatx80(zSign,
4679                                 floatx80_infinity_high,
4680                                 floatx80_infinity_low);
4681         }
4682         if ( aExp == 0 ) ++expDiff;
4683         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4684         zExp = bExp;
4685     }
4686     else {
4687         if ( aExp == 0x7FFF ) {
4688             if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4689                 return propagateFloatx80NaN(a, b, status);
4690             }
4691             return a;
4692         }
4693         zSig1 = 0;
4694         zSig0 = aSig + bSig;
4695         if ( aExp == 0 ) {
4696             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4697             goto roundAndPack;
4698         }
4699         zExp = aExp;
4700         goto shiftRight1;
4701     }
4702     zSig0 = aSig + bSig;
4703     if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
4704  shiftRight1:
4705     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4706     zSig0 |= LIT64( 0x8000000000000000 );
4707     ++zExp;
4708  roundAndPack:
4709     return roundAndPackFloatx80(status->floatx80_rounding_precision,
4710                                 zSign, zExp, zSig0, zSig1, status);
4711 }
4712 
4713 /*----------------------------------------------------------------------------
4714 | Returns the result of subtracting the absolute values of the extended
4715 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
4716 | difference is negated before being returned.  `zSign' is ignored if the
4717 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
4718 | Standard for Binary Floating-Point Arithmetic.
4719 *----------------------------------------------------------------------------*/
4720 
4721 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
4722                                 float_status *status)
4723 {
4724     int32_t aExp, bExp, zExp;
4725     uint64_t aSig, bSig, zSig0, zSig1;
4726     int32_t expDiff;
4727 
4728     aSig = extractFloatx80Frac( a );
4729     aExp = extractFloatx80Exp( a );
4730     bSig = extractFloatx80Frac( b );
4731     bExp = extractFloatx80Exp( b );
4732     expDiff = aExp - bExp;
4733     if ( 0 < expDiff ) goto aExpBigger;
4734     if ( expDiff < 0 ) goto bExpBigger;
4735     if ( aExp == 0x7FFF ) {
4736         if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4737             return propagateFloatx80NaN(a, b, status);
4738         }
4739         float_raise(float_flag_invalid, status);
4740         return floatx80_default_nan(status);
4741     }
4742     if ( aExp == 0 ) {
4743         aExp = 1;
4744         bExp = 1;
4745     }
4746     zSig1 = 0;
4747     if ( bSig < aSig ) goto aBigger;
4748     if ( aSig < bSig ) goto bBigger;
4749     return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
4750  bExpBigger:
4751     if ( bExp == 0x7FFF ) {
4752         if ((uint64_t)(bSig << 1)) {
4753             return propagateFloatx80NaN(a, b, status);
4754         }
4755         return packFloatx80(zSign ^ 1, floatx80_infinity_high,
4756                             floatx80_infinity_low);
4757     }
4758     if ( aExp == 0 ) ++expDiff;
4759     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4760  bBigger:
4761     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4762     zExp = bExp;
4763     zSign ^= 1;
4764     goto normalizeRoundAndPack;
4765  aExpBigger:
4766     if ( aExp == 0x7FFF ) {
4767         if ((uint64_t)(aSig << 1)) {
4768             return propagateFloatx80NaN(a, b, status);
4769         }
4770         return a;
4771     }
4772     if ( bExp == 0 ) --expDiff;
4773     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4774  aBigger:
4775     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4776     zExp = aExp;
4777  normalizeRoundAndPack:
4778     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
4779                                          zSign, zExp, zSig0, zSig1, status);
4780 }
4781 
4782 /*----------------------------------------------------------------------------
4783 | Returns the result of adding the extended double-precision floating-point
4784 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
4785 | Standard for Binary Floating-Point Arithmetic.
4786 *----------------------------------------------------------------------------*/
4787 
4788 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
4789 {
4790     flag aSign, bSign;
4791 
4792     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
4793         float_raise(float_flag_invalid, status);
4794         return floatx80_default_nan(status);
4795     }
4796     aSign = extractFloatx80Sign( a );
4797     bSign = extractFloatx80Sign( b );
4798     if ( aSign == bSign ) {
4799         return addFloatx80Sigs(a, b, aSign, status);
4800     }
4801     else {
4802         return subFloatx80Sigs(a, b, aSign, status);
4803     }
4804 
4805 }
4806 
4807 /*----------------------------------------------------------------------------
4808 | Returns the result of subtracting the extended double-precision floating-
4809 | point values `a' and `b'.  The operation is performed according to the
4810 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4811 *----------------------------------------------------------------------------*/
4812 
4813 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
4814 {
4815     flag aSign, bSign;
4816 
4817     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
4818         float_raise(float_flag_invalid, status);
4819         return floatx80_default_nan(status);
4820     }
4821     aSign = extractFloatx80Sign( a );
4822     bSign = extractFloatx80Sign( b );
4823     if ( aSign == bSign ) {
4824         return subFloatx80Sigs(a, b, aSign, status);
4825     }
4826     else {
4827         return addFloatx80Sigs(a, b, aSign, status);
4828     }
4829 
4830 }
4831 
/*----------------------------------------------------------------------------
| Returns the result of multiplying the extended double-precision floating-
| point values `a' and `b'.  The operation is performed according to the
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
|
| Cases resolved before the significands are multiplied:
|   - an invalid encoding in either operand raises the invalid exception and
|     returns the default NaN;
|   - a NaN operand is handed to propagateFloatx80NaN;
|   - infinity times zero raises the invalid exception and returns the
|     default NaN; infinity times anything else returns a signed infinity;
|   - a zero operand returns a signed zero.
| The result sign is the exclusive-or of the operand signs.
*----------------------------------------------------------------------------*/

floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
{
    flag aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;

    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    bSign = extractFloatx80Sign( b );
    zSign = aSign ^ bSign;
    /* `a' has the maximum exponent: it is an infinity or a NaN. */
    if ( aExp == 0x7FFF ) {
        /* `sig<<1' drops bit 63; a non-zero remainder marks a NaN. */
        if (    (uint64_t) ( aSig<<1 )
             || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* infinity * zero */
        if ( ( bExp | bSig ) == 0 ) goto invalid;
        return packFloatx80(zSign, floatx80_infinity_high,
                                   floatx80_infinity_low);
    }
    /* `b' has the maximum exponent while `a' does not. */
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        /* zero * infinity */
        if ( ( aExp | aSig ) == 0 ) {
 invalid:
            float_raise(float_flag_invalid, status);
            return floatx80_default_nan(status);
        }
        return packFloatx80(zSign, floatx80_infinity_high,
                                   floatx80_infinity_low);
    }
    /* Zero exponent: either a zero (result is a signed zero) or a value that
     * is normalized before the multiply. */
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
    }
    zExp = aExp + bExp - 0x3FFE;
    /* Full 64x64 -> 128-bit product, high word in zSig0. */
    mul64To128( aSig, bSig, &zSig0, &zSig1 );
    /* If bit 63 of the high word is clear, shift the product left one bit
     * and compensate in the exponent. */
    if ( 0 < (int64_t) zSig0 ) {
        shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
        --zExp;
    }
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                zSign, zExp, zSig0, zSig1, status);
}
4893 
/*----------------------------------------------------------------------------
| Returns the result of dividing the extended double-precision floating-point
| value `a' by the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
|
| Cases resolved before the significands are divided:
|   - an invalid encoding in either operand raises the invalid exception and
|     returns the default NaN;
|   - a NaN operand is handed to propagateFloatx80NaN;
|   - infinity / infinity and zero / zero raise the invalid exception and
|     return the default NaN;
|   - infinity / finite returns a signed infinity, finite / infinity a signed
|     zero;
|   - non-zero / zero raises the divide-by-zero exception and returns a
|     signed infinity;
|   - zero / non-zero returns a signed zero.
| The result sign is the exclusive-or of the operand signs.
*----------------------------------------------------------------------------*/

floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
{
    flag aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig0, zSig1;
    uint64_t rem0, rem1, rem2, term0, term1, term2;

    if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    bSig = extractFloatx80Frac( b );
    bExp = extractFloatx80Exp( b );
    bSign = extractFloatx80Sign( b );
    zSign = aSign ^ bSign;
    /* `a' is an infinity or a NaN. */
    if ( aExp == 0x7FFF ) {
        /* `sig<<1' drops bit 63; a non-zero remainder marks a NaN. */
        if ((uint64_t)(aSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        if ( bExp == 0x7FFF ) {
            if ((uint64_t)(bSig << 1)) {
                return propagateFloatx80NaN(a, b, status);
            }
            /* infinity / infinity */
            goto invalid;
        }
        return packFloatx80(zSign, floatx80_infinity_high,
                                   floatx80_infinity_low);
    }
    /* `b' is an infinity or a NaN while `a' is finite. */
    if ( bExp == 0x7FFF ) {
        if ((uint64_t)(bSig << 1)) {
            return propagateFloatx80NaN(a, b, status);
        }
        return packFloatx80( zSign, 0, 0 );
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            /* zero / zero */
            if ( ( aExp | aSig ) == 0 ) {
 invalid:
                float_raise(float_flag_invalid, status);
                return floatx80_default_nan(status);
            }
            /* non-zero / zero */
            float_raise(float_flag_divbyzero, status);
            return packFloatx80(zSign, floatx80_infinity_high,
                                       floatx80_infinity_low);
        }
        normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
        normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
    }
    zExp = aExp - bExp + 0x3FFE;
    rem1 = 0;
    /* Halve the dividend (keeping the shifted-out bit in rem1) when it is
     * not smaller than the divisor, and compensate in the exponent. */
    if ( bSig <= aSig ) {
        shift128Right( aSig, 0, 1, &aSig, &rem1 );
        ++zExp;
    }
    /* High quotient word: take an estimate, compute the 128-bit remainder,
     * then step the estimate down while that remainder is negative. */
    zSig0 = estimateDiv128To64( aSig, rem1, bSig );
    mul64To128( bSig, zSig0, &term0, &term1 );
    sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
    }
    /* Low quotient word, estimated from the remaining remainder. */
    zSig1 = estimateDiv128To64( rem1, 0, bSig );
    /* Only when the estimate (ignoring its top bit) is small is it corrected
     * exactly and given a sticky bit for any non-zero remainder. */
    if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
        mul64To128( bSig, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
        }
        zSig1 |= ( ( rem1 | rem2 ) != 0 );
    }
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                zSign, zExp, zSig0, zSig1, status);
}
4980 
4981 /*----------------------------------------------------------------------------
4982 | Returns the remainder of the extended double-precision floating-point value
4983 | `a' with respect to the corresponding value `b'.  The operation is performed
4984 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4985 *----------------------------------------------------------------------------*/
4986 
4987 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
4988 {
4989     flag aSign, zSign;
4990     int32_t aExp, bExp, expDiff;
4991     uint64_t aSig0, aSig1, bSig;
4992     uint64_t q, term0, term1, alternateASig0, alternateASig1;
4993 
4994     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
4995         float_raise(float_flag_invalid, status);
4996         return floatx80_default_nan(status);
4997     }
4998     aSig0 = extractFloatx80Frac( a );
4999     aExp = extractFloatx80Exp( a );
5000     aSign = extractFloatx80Sign( a );
5001     bSig = extractFloatx80Frac( b );
5002     bExp = extractFloatx80Exp( b );
5003     if ( aExp == 0x7FFF ) {
5004         if (    (uint64_t) ( aSig0<<1 )
5005              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5006             return propagateFloatx80NaN(a, b, status);
5007         }
5008         goto invalid;
5009     }
5010     if ( bExp == 0x7FFF ) {
5011         if ((uint64_t)(bSig << 1)) {
5012             return propagateFloatx80NaN(a, b, status);
5013         }
5014         return a;
5015     }
5016     if ( bExp == 0 ) {
5017         if ( bSig == 0 ) {
5018  invalid:
5019             float_raise(float_flag_invalid, status);
5020             return floatx80_default_nan(status);
5021         }
5022         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5023     }
5024     if ( aExp == 0 ) {
5025         if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
5026         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5027     }
5028     bSig |= LIT64( 0x8000000000000000 );
5029     zSign = aSign;
5030     expDiff = aExp - bExp;
5031     aSig1 = 0;
5032     if ( expDiff < 0 ) {
5033         if ( expDiff < -1 ) return a;
5034         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5035         expDiff = 0;
5036     }
5037     q = ( bSig <= aSig0 );
5038     if ( q ) aSig0 -= bSig;
5039     expDiff -= 64;
5040     while ( 0 < expDiff ) {
5041         q = estimateDiv128To64( aSig0, aSig1, bSig );
5042         q = ( 2 < q ) ? q - 2 : 0;
5043         mul64To128( bSig, q, &term0, &term1 );
5044         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5045         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5046         expDiff -= 62;
5047     }
5048     expDiff += 64;
5049     if ( 0 < expDiff ) {
5050         q = estimateDiv128To64( aSig0, aSig1, bSig );
5051         q = ( 2 < q ) ? q - 2 : 0;
5052         q >>= 64 - expDiff;
5053         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5054         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5055         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5056         while ( le128( term0, term1, aSig0, aSig1 ) ) {
5057             ++q;
5058             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5059         }
5060     }
5061     else {
5062         term1 = 0;
5063         term0 = bSig;
5064     }
5065     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5066     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5067          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5068               && ( q & 1 ) )
5069        ) {
5070         aSig0 = alternateASig0;
5071         aSig1 = alternateASig1;
5072         zSign = ! zSign;
5073     }
5074     return
5075         normalizeRoundAndPackFloatx80(
5076             80, zSign, bExp + expDiff, aSig0, aSig1, status);
5077 
5078 }
5079 
/*----------------------------------------------------------------------------
| Returns the square root of the extended double-precision floating-point
| value `a'.  The operation is performed according to the IEC/IEEE Standard
| for Binary Floating-Point Arithmetic.
|
| Cases resolved before the root is computed:
|   - an invalid encoding raises the invalid exception and returns the
|     default NaN;
|   - a NaN is handed to propagateFloatx80NaN;
|   - positive infinity and zeros of either sign are returned unchanged
|     (a positive zero is repacked as +0);
|   - any other negative input raises the invalid exception and returns the
|     default NaN.
*----------------------------------------------------------------------------*/

floatx80 floatx80_sqrt(floatx80 a, float_status *status)
{
    flag aSign;
    int32_t aExp, zExp;
    uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    if (floatx80_invalid_encoding(a)) {
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    aSig0 = extractFloatx80Frac( a );
    aExp = extractFloatx80Exp( a );
    aSign = extractFloatx80Sign( a );
    /* Infinity or NaN. */
    if ( aExp == 0x7FFF ) {
        /* `sig<<1' drops bit 63; a non-zero remainder marks a NaN. */
        if ((uint64_t)(aSig0 << 1)) {
            return propagateFloatx80NaN(a, a, status);
        }
        if ( ! aSign ) return a;
        /* negative infinity */
        goto invalid;
    }
    if ( aSign ) {
        /* negative zero is returned as is */
        if ( ( aExp | aSig0 ) == 0 ) return a;
 invalid:
        float_raise(float_flag_invalid, status);
        return floatx80_default_nan(status);
    }
    if ( aExp == 0 ) {
        if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
        normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
    }
    /* Halve the unbiased exponent. */
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
    /* Start from a 32-bit estimate and widen it with one division step. */
    zSig0 = estimateSqrt32( aExp, aSig0>>32 );
    /* The shift amount depends on the parity of the exponent. */
    shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
    doubleZSig0 = zSig0<<1;
    /* rem = a - zSig0^2; lower zSig0 while the remainder is negative. */
    mul64To128( zSig0, zSig0, &term0, &term1 );
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        doubleZSig0 -= 2;
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
    }
    /* Low result word, estimated from the remainder. */
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
    /* Only when the low 62 bits of the estimate are small is it corrected
     * exactly and given a sticky bit for any non-zero remainder. */
    if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
        if ( zSig1 == 0 ) zSig1 = 1;
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        mul64To128( zSig1, zSig1, &term2, &term3 );
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
            term3 |= 1;
            term2 |= doubleZSig0;
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
        }
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    /* Recombine the two words into the 128-bit significand to be rounded. */
    shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
    zSig0 |= doubleZSig0;
    return roundAndPackFloatx80(status->floatx80_rounding_precision,
                                0, zExp, zSig0, zSig1, status);
}
5150 
5151 /*----------------------------------------------------------------------------
5152 | Returns 1 if the extended double-precision floating-point value `a' is equal
5153 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
5154 | raised if either operand is a NaN.  Otherwise, the comparison is performed
5155 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5156 *----------------------------------------------------------------------------*/
5157 
5158 int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
5159 {
5160 
5161     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5162         || (extractFloatx80Exp(a) == 0x7FFF
5163             && (uint64_t) (extractFloatx80Frac(a) << 1))
5164         || (extractFloatx80Exp(b) == 0x7FFF
5165             && (uint64_t) (extractFloatx80Frac(b) << 1))
5166        ) {
5167         float_raise(float_flag_invalid, status);
5168         return 0;
5169     }
5170     return
5171            ( a.low == b.low )
5172         && (    ( a.high == b.high )
5173              || (    ( a.low == 0 )
5174                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5175            );
5176 
5177 }
5178 
5179 /*----------------------------------------------------------------------------
5180 | Returns 1 if the extended double-precision floating-point value `a' is
5181 | less than or equal to the corresponding value `b', and 0 otherwise.  The
5182 | invalid exception is raised if either operand is a NaN.  The comparison is
5183 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5184 | Arithmetic.
5185 *----------------------------------------------------------------------------*/
5186 
5187 int floatx80_le(floatx80 a, floatx80 b, float_status *status)
5188 {
5189     flag aSign, bSign;
5190 
5191     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5192         || (extractFloatx80Exp(a) == 0x7FFF
5193             && (uint64_t) (extractFloatx80Frac(a) << 1))
5194         || (extractFloatx80Exp(b) == 0x7FFF
5195             && (uint64_t) (extractFloatx80Frac(b) << 1))
5196        ) {
5197         float_raise(float_flag_invalid, status);
5198         return 0;
5199     }
5200     aSign = extractFloatx80Sign( a );
5201     bSign = extractFloatx80Sign( b );
5202     if ( aSign != bSign ) {
5203         return
5204                aSign
5205             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5206                  == 0 );
5207     }
5208     return
5209           aSign ? le128( b.high, b.low, a.high, a.low )
5210         : le128( a.high, a.low, b.high, b.low );
5211 
5212 }
5213 
5214 /*----------------------------------------------------------------------------
5215 | Returns 1 if the extended double-precision floating-point value `a' is
5216 | less than the corresponding value `b', and 0 otherwise.  The invalid
5217 | exception is raised if either operand is a NaN.  The comparison is performed
5218 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5219 *----------------------------------------------------------------------------*/
5220 
5221 int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
5222 {
5223     flag aSign, bSign;
5224 
5225     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5226         || (extractFloatx80Exp(a) == 0x7FFF
5227             && (uint64_t) (extractFloatx80Frac(a) << 1))
5228         || (extractFloatx80Exp(b) == 0x7FFF
5229             && (uint64_t) (extractFloatx80Frac(b) << 1))
5230        ) {
5231         float_raise(float_flag_invalid, status);
5232         return 0;
5233     }
5234     aSign = extractFloatx80Sign( a );
5235     bSign = extractFloatx80Sign( b );
5236     if ( aSign != bSign ) {
5237         return
5238                aSign
5239             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5240                  != 0 );
5241     }
5242     return
5243           aSign ? lt128( b.high, b.low, a.high, a.low )
5244         : lt128( a.high, a.low, b.high, b.low );
5245 
5246 }
5247 
5248 /*----------------------------------------------------------------------------
5249 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5250 | cannot be compared, and 0 otherwise.  The invalid exception is raised if
5251 | either operand is a NaN.   The comparison is performed according to the
5252 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5253 *----------------------------------------------------------------------------*/
5254 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
5255 {
5256     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5257         || (extractFloatx80Exp(a) == 0x7FFF
5258             && (uint64_t) (extractFloatx80Frac(a) << 1))
5259         || (extractFloatx80Exp(b) == 0x7FFF
5260             && (uint64_t) (extractFloatx80Frac(b) << 1))
5261        ) {
5262         float_raise(float_flag_invalid, status);
5263         return 1;
5264     }
5265     return 0;
5266 }
5267 
5268 /*----------------------------------------------------------------------------
5269 | Returns 1 if the extended double-precision floating-point value `a' is
5270 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5271 | cause an exception.  The comparison is performed according to the IEC/IEEE
5272 | Standard for Binary Floating-Point Arithmetic.
5273 *----------------------------------------------------------------------------*/
5274 
5275 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
5276 {
5277 
5278     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5279         float_raise(float_flag_invalid, status);
5280         return 0;
5281     }
5282     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5283               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5284          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5285               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5286        ) {
5287         if (floatx80_is_signaling_nan(a, status)
5288          || floatx80_is_signaling_nan(b, status)) {
5289             float_raise(float_flag_invalid, status);
5290         }
5291         return 0;
5292     }
5293     return
5294            ( a.low == b.low )
5295         && (    ( a.high == b.high )
5296              || (    ( a.low == 0 )
5297                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5298            );
5299 
5300 }
5301 
5302 /*----------------------------------------------------------------------------
5303 | Returns 1 if the extended double-precision floating-point value `a' is less
5304 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
5305 | do not cause an exception.  Otherwise, the comparison is performed according
5306 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5307 *----------------------------------------------------------------------------*/
5308 
5309 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
5310 {
5311     flag aSign, bSign;
5312 
5313     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5314         float_raise(float_flag_invalid, status);
5315         return 0;
5316     }
5317     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5318               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5319          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5320               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5321        ) {
5322         if (floatx80_is_signaling_nan(a, status)
5323          || floatx80_is_signaling_nan(b, status)) {
5324             float_raise(float_flag_invalid, status);
5325         }
5326         return 0;
5327     }
5328     aSign = extractFloatx80Sign( a );
5329     bSign = extractFloatx80Sign( b );
5330     if ( aSign != bSign ) {
5331         return
5332                aSign
5333             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5334                  == 0 );
5335     }
5336     return
5337           aSign ? le128( b.high, b.low, a.high, a.low )
5338         : le128( a.high, a.low, b.high, b.low );
5339 
5340 }
5341 
5342 /*----------------------------------------------------------------------------
5343 | Returns 1 if the extended double-precision floating-point value `a' is less
5344 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
5345 | an exception.  Otherwise, the comparison is performed according to the
5346 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5347 *----------------------------------------------------------------------------*/
5348 
5349 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
5350 {
5351     flag aSign, bSign;
5352 
5353     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5354         float_raise(float_flag_invalid, status);
5355         return 0;
5356     }
5357     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5358               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5359          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5360               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5361        ) {
5362         if (floatx80_is_signaling_nan(a, status)
5363          || floatx80_is_signaling_nan(b, status)) {
5364             float_raise(float_flag_invalid, status);
5365         }
5366         return 0;
5367     }
5368     aSign = extractFloatx80Sign( a );
5369     bSign = extractFloatx80Sign( b );
5370     if ( aSign != bSign ) {
5371         return
5372                aSign
5373             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5374                  != 0 );
5375     }
5376     return
5377           aSign ? lt128( b.high, b.low, a.high, a.low )
5378         : lt128( a.high, a.low, b.high, b.low );
5379 
5380 }
5381 
5382 /*----------------------------------------------------------------------------
5383 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5384 | cannot be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.
5385 | The comparison is performed according to the IEC/IEEE Standard for Binary
5386 | Floating-Point Arithmetic.
5387 *----------------------------------------------------------------------------*/
5388 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
5389 {
5390     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5391         float_raise(float_flag_invalid, status);
5392         return 1;
5393     }
5394     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5395               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5396          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5397               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5398        ) {
5399         if (floatx80_is_signaling_nan(a, status)
5400          || floatx80_is_signaling_nan(b, status)) {
5401             float_raise(float_flag_invalid, status);
5402         }
5403         return 1;
5404     }
5405     return 0;
5406 }
5407 
5408 /*----------------------------------------------------------------------------
5409 | Returns the result of converting the quadruple-precision floating-point
5410 | value `a' to the 32-bit two's complement integer format.  The conversion
5411 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5412 | Arithmetic---which means in particular that the conversion is rounded
5413 | according to the current rounding mode.  If `a' is a NaN, the largest
5414 | positive integer is returned.  Otherwise, if the conversion overflows, the
5415 | largest integer with the same sign as `a' is returned.
5416 *----------------------------------------------------------------------------*/
5417 
5418 int32_t float128_to_int32(float128 a, float_status *status)
5419 {
5420     flag aSign;
5421     int32_t aExp, shiftCount;
5422     uint64_t aSig0, aSig1;
5423 
5424     aSig1 = extractFloat128Frac1( a );
5425     aSig0 = extractFloat128Frac0( a );
5426     aExp = extractFloat128Exp( a );
5427     aSign = extractFloat128Sign( a );
5428     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5429     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5430     aSig0 |= ( aSig1 != 0 );
5431     shiftCount = 0x4028 - aExp;
5432     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5433     return roundAndPackInt32(aSign, aSig0, status);
5434 
5435 }
5436 
/*----------------------------------------------------------------------------
| Returns the result of converting the quadruple-precision floating-point
| value `a' to the 32-bit two's complement integer format.  The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic, except that the conversion is always rounded toward zero.  If
| `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
| conversion overflows, the largest integer with the same sign as `a' is
| returned.  Overflow and NaN raise the invalid exception; discarded
| fraction bits set the inexact flag.
*----------------------------------------------------------------------------*/

int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
{
    flag aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig0, aSig1, savedASig;
    int32_t z;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    /* Collapse the low fraction word into a sticky bit in aSig0. */
    aSig0 |= ( aSig1 != 0 );
    /* Exponent above 0x401E: too large for 32 bits, or infinity/NaN. */
    if ( 0x401E < aExp ) {
        /* A NaN is reported as the positive maximum. */
        if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
        goto invalid;
    }
    /* Exponent below the bias: magnitude under 1, truncates to 0. */
    else if ( aExp < 0x3FFF ) {
        if (aExp || aSig0) {
            status->float_exception_flags |= float_flag_inexact;
        }
        return 0;
    }
    /* Add the implicit leading bit. */
    aSig0 |= LIT64( 0x0001000000000000 );
    shiftCount = 0x402F - aExp;
    savedASig = aSig0;
    aSig0 >>= shiftCount;
    z = aSig0;
    /* NOTE(review): if z is INT32_MIN here, `- z' looks like signed
     * overflow; the code appears to rely on it wrapping -- confirm. */
    if ( aSign ) z = - z;
    /* The integer's sign disagreeing with the sign of `a' means overflow. */
    if ( ( z < 0 ) ^ aSign ) {
 invalid:
        float_raise(float_flag_invalid, status);
        return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
    }
    /* Any bit lost by the right shift makes the result inexact. */
    if ( ( aSig0<<shiftCount ) != savedASig ) {
        status->float_exception_flags |= float_flag_inexact;
    }
    return z;

}
5486 
5487 /*----------------------------------------------------------------------------
5488 | Returns the result of converting the quadruple-precision floating-point
5489 | value `a' to the 64-bit two's complement integer format.  The conversion
5490 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5491 | Arithmetic---which means in particular that the conversion is rounded
5492 | according to the current rounding mode.  If `a' is a NaN, the largest
5493 | positive integer is returned.  Otherwise, if the conversion overflows, the
5494 | largest integer with the same sign as `a' is returned.
5495 *----------------------------------------------------------------------------*/
5496 
5497 int64_t float128_to_int64(float128 a, float_status *status)
5498 {
5499     flag aSign;
5500     int32_t aExp, shiftCount;
5501     uint64_t aSig0, aSig1;
5502 
5503     aSig1 = extractFloat128Frac1( a );
5504     aSig0 = extractFloat128Frac0( a );
5505     aExp = extractFloat128Exp( a );
5506     aSign = extractFloat128Sign( a );
5507     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5508     shiftCount = 0x402F - aExp;
5509     if ( shiftCount <= 0 ) {
5510         if ( 0x403E < aExp ) {
5511             float_raise(float_flag_invalid, status);
5512             if (    ! aSign
5513                  || (    ( aExp == 0x7FFF )
5514                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5515                     )
5516                ) {
5517                 return LIT64( 0x7FFFFFFFFFFFFFFF );
5518             }
5519             return (int64_t) LIT64( 0x8000000000000000 );
5520         }
5521         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5522     }
5523     else {
5524         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5525     }
5526     return roundAndPackInt64(aSign, aSig0, aSig1, status);
5527 
5528 }
5529 
/*----------------------------------------------------------------------------
| Returns the result of converting the quadruple-precision floating-point
| value `a' to the 64-bit two's complement integer format.  The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic, except that the conversion is always rounded toward zero.
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
| the conversion overflows, the largest integer with the same sign as `a' is
| returned.  Overflow and NaN raise the invalid exception; discarded
| fraction bits set the inexact flag.
*----------------------------------------------------------------------------*/

int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
{
    flag aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig0, aSig1;
    int64_t z;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    /* Add the implicit leading bit when the exponent field is non-zero. */
    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
    /* Positive: the integer part extends into aSig1 (left shift needed). */
    shiftCount = aExp - 0x402F;
    if ( 0 < shiftCount ) {
        if ( 0x403E <= aExp ) {
            /* Drop the implicit bit again so aSig0 is the raw fraction. */
            aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
            /* Sign set, exponent 0x403E, high fraction zero and low word
             * below 2^49: truncation gives exactly -2^63, which is
             * representable; it is inexact if any low bit is set. */
            if (    ( a.high == LIT64( 0xC03E000000000000 ) )
                 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
                if (aSig1) {
                    status->float_exception_flags |= float_flag_inexact;
                }
            }
            else {
                float_raise(float_flag_invalid, status);
                /* Positive overflow and NaNs give the positive maximum. */
                if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
                    return LIT64( 0x7FFFFFFFFFFFFFFF );
                }
            }
            return (int64_t) LIT64( 0x8000000000000000 );
        }
        /* Combine the high word with the top shiftCount bits of aSig1. */
        z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
        /* Remaining low bits of aSig1 are discarded fraction. */
        if ( (uint64_t) ( aSig1<<shiftCount ) ) {
            status->float_exception_flags |= float_flag_inexact;
        }
    }
    else {
        /* Exponent below the bias: magnitude under 1, truncates to 0. */
        if ( aExp < 0x3FFF ) {
            if ( aExp | aSig0 | aSig1 ) {
                status->float_exception_flags |= float_flag_inexact;
            }
            return 0;
        }
        /* The integer part lies entirely in aSig0. */
        z = aSig0>>( - shiftCount );
        /* Inexact if aSig1 or the bits shifted out of aSig0 are non-zero. */
        if (    aSig1
             || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
            status->float_exception_flags |= float_flag_inexact;
        }
    }
    if ( aSign ) z = - z;
    return z;

}
5592 
5593 /*----------------------------------------------------------------------------
5594 | Returns the result of converting the quadruple-precision floating-point value
5595 | `a' to the 64-bit unsigned integer format.  The conversion is
5596 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5597 | Arithmetic---which means in particular that the conversion is rounded
5598 | according to the current rounding mode.  If `a' is a NaN, the largest
5599 | positive integer is returned.  If the conversion overflows, the
5600 | largest unsigned integer is returned.  If 'a' is negative, the value is
5601 | rounded and zero is returned; negative values that do not round to zero
5602 | will raise the inexact exception.
5603 *----------------------------------------------------------------------------*/
5604 
5605 uint64_t float128_to_uint64(float128 a, float_status *status)
5606 {
5607     flag aSign;
5608     int aExp;
5609     int shiftCount;
5610     uint64_t aSig0, aSig1;
5611 
5612     aSig0 = extractFloat128Frac0(a);
5613     aSig1 = extractFloat128Frac1(a);
5614     aExp = extractFloat128Exp(a);
5615     aSign = extractFloat128Sign(a);
5616     if (aSign && (aExp > 0x3FFE)) {
5617         float_raise(float_flag_invalid, status);
5618         if (float128_is_any_nan(a)) {
5619             return LIT64(0xFFFFFFFFFFFFFFFF);
5620         } else {
5621             return 0;
5622         }
5623     }
5624     if (aExp) {
5625         aSig0 |= LIT64(0x0001000000000000);
5626     }
5627     shiftCount = 0x402F - aExp;
5628     if (shiftCount <= 0) {
5629         if (0x403E < aExp) {
5630             float_raise(float_flag_invalid, status);
5631             return LIT64(0xFFFFFFFFFFFFFFFF);
5632         }
5633         shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
5634     } else {
5635         shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
5636     }
5637     return roundAndPackUint64(aSign, aSig0, aSig1, status);
5638 }
5639 
5640 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
5641 {
5642     uint64_t v;
5643     signed char current_rounding_mode = status->float_rounding_mode;
5644 
5645     set_float_rounding_mode(float_round_to_zero, status);
5646     v = float128_to_uint64(a, status);
5647     set_float_rounding_mode(current_rounding_mode, status);
5648 
5649     return v;
5650 }
5651 
5652 /*----------------------------------------------------------------------------
5653 | Returns the result of converting the quadruple-precision floating-point
5654 | value `a' to the 32-bit unsigned integer format.  The conversion
5655 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5656 | Arithmetic except that the conversion is always rounded toward zero.
5657 | If `a' is a NaN, the largest positive integer is returned.  Otherwise,
5658 | if the conversion overflows, the largest unsigned integer is returned.
5659 | If 'a' is negative, the value is rounded and zero is returned; negative
5660 | values that do not round to zero will raise the inexact exception.
5661 *----------------------------------------------------------------------------*/
5662 
5663 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
5664 {
5665     uint64_t v;
5666     uint32_t res;
5667     int old_exc_flags = get_float_exception_flags(status);
5668 
5669     v = float128_to_uint64_round_to_zero(a, status);
5670     if (v > 0xffffffff) {
5671         res = 0xffffffff;
5672     } else {
5673         return v;
5674     }
5675     set_float_exception_flags(old_exc_flags, status);
5676     float_raise(float_flag_invalid, status);
5677     return res;
5678 }
5679 
5680 /*----------------------------------------------------------------------------
5681 | Returns the result of converting the quadruple-precision floating-point
5682 | value `a' to the single-precision floating-point format.  The conversion
5683 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5684 | Arithmetic.
5685 *----------------------------------------------------------------------------*/
5686 
5687 float32 float128_to_float32(float128 a, float_status *status)
5688 {
5689     flag aSign;
5690     int32_t aExp;
5691     uint64_t aSig0, aSig1;
5692     uint32_t zSig;
5693 
5694     aSig1 = extractFloat128Frac1( a );
5695     aSig0 = extractFloat128Frac0( a );
5696     aExp = extractFloat128Exp( a );
5697     aSign = extractFloat128Sign( a );
5698     if ( aExp == 0x7FFF ) {
5699         if ( aSig0 | aSig1 ) {
5700             return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
5701         }
5702         return packFloat32( aSign, 0xFF, 0 );
5703     }
5704     aSig0 |= ( aSig1 != 0 );
5705     shift64RightJamming( aSig0, 18, &aSig0 );
5706     zSig = aSig0;
5707     if ( aExp || zSig ) {
5708         zSig |= 0x40000000;
5709         aExp -= 0x3F81;
5710     }
5711     return roundAndPackFloat32(aSign, aExp, zSig, status);
5712 
5713 }
5714 
5715 /*----------------------------------------------------------------------------
5716 | Returns the result of converting the quadruple-precision floating-point
5717 | value `a' to the double-precision floating-point format.  The conversion
5718 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5719 | Arithmetic.
5720 *----------------------------------------------------------------------------*/
5721 
5722 float64 float128_to_float64(float128 a, float_status *status)
5723 {
5724     flag aSign;
5725     int32_t aExp;
5726     uint64_t aSig0, aSig1;
5727 
5728     aSig1 = extractFloat128Frac1( a );
5729     aSig0 = extractFloat128Frac0( a );
5730     aExp = extractFloat128Exp( a );
5731     aSign = extractFloat128Sign( a );
5732     if ( aExp == 0x7FFF ) {
5733         if ( aSig0 | aSig1 ) {
5734             return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
5735         }
5736         return packFloat64( aSign, 0x7FF, 0 );
5737     }
5738     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5739     aSig0 |= ( aSig1 != 0 );
5740     if ( aExp || aSig0 ) {
5741         aSig0 |= LIT64( 0x4000000000000000 );
5742         aExp -= 0x3C01;
5743     }
5744     return roundAndPackFloat64(aSign, aExp, aSig0, status);
5745 
5746 }
5747 
5748 /*----------------------------------------------------------------------------
5749 | Returns the result of converting the quadruple-precision floating-point
5750 | value `a' to the extended double-precision floating-point format.  The
5751 | conversion is performed according to the IEC/IEEE Standard for Binary
5752 | Floating-Point Arithmetic.
5753 *----------------------------------------------------------------------------*/
5754 
5755 floatx80 float128_to_floatx80(float128 a, float_status *status)
5756 {
5757     flag aSign;
5758     int32_t aExp;
5759     uint64_t aSig0, aSig1;
5760 
5761     aSig1 = extractFloat128Frac1( a );
5762     aSig0 = extractFloat128Frac0( a );
5763     aExp = extractFloat128Exp( a );
5764     aSign = extractFloat128Sign( a );
5765     if ( aExp == 0x7FFF ) {
5766         if ( aSig0 | aSig1 ) {
5767             return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
5768         }
5769         return packFloatx80(aSign, floatx80_infinity_high,
5770                                    floatx80_infinity_low);
5771     }
5772     if ( aExp == 0 ) {
5773         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
5774         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5775     }
5776     else {
5777         aSig0 |= LIT64( 0x0001000000000000 );
5778     }
5779     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
5780     return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
5781 
5782 }
5783 
5784 /*----------------------------------------------------------------------------
5785 | Rounds the quadruple-precision floating-point value `a' to an integer, and
5786 | returns the result as a quadruple-precision floating-point value.  The
5787 | operation is performed according to the IEC/IEEE Standard for Binary
5788 | Floating-Point Arithmetic.
5789 *----------------------------------------------------------------------------*/
5790 
5791 float128 float128_round_to_int(float128 a, float_status *status)
5792 {
5793     flag aSign;
5794     int32_t aExp;
5795     uint64_t lastBitMask, roundBitsMask;
5796     float128 z;
5797 
5798     aExp = extractFloat128Exp( a );
5799     if ( 0x402F <= aExp ) {
5800         if ( 0x406F <= aExp ) {
5801             if (    ( aExp == 0x7FFF )
5802                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
5803                ) {
5804                 return propagateFloat128NaN(a, a, status);
5805             }
5806             return a;
5807         }
5808         lastBitMask = 1;
5809         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
5810         roundBitsMask = lastBitMask - 1;
5811         z = a;
5812         switch (status->float_rounding_mode) {
5813         case float_round_nearest_even:
5814             if ( lastBitMask ) {
5815                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
5816                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
5817             }
5818             else {
5819                 if ( (int64_t) z.low < 0 ) {
5820                     ++z.high;
5821                     if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
5822                 }
5823             }
5824             break;
5825         case float_round_ties_away:
5826             if (lastBitMask) {
5827                 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
5828             } else {
5829                 if ((int64_t) z.low < 0) {
5830                     ++z.high;
5831                 }
5832             }
5833             break;
5834         case float_round_to_zero:
5835             break;
5836         case float_round_up:
5837             if (!extractFloat128Sign(z)) {
5838                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
5839             }
5840             break;
5841         case float_round_down:
5842             if (extractFloat128Sign(z)) {
5843                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
5844             }
5845             break;
5846         default:
5847             abort();
5848         }
5849         z.low &= ~ roundBitsMask;
5850     }
5851     else {
5852         if ( aExp < 0x3FFF ) {
5853             if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
5854             status->float_exception_flags |= float_flag_inexact;
5855             aSign = extractFloat128Sign( a );
5856             switch (status->float_rounding_mode) {
5857              case float_round_nearest_even:
5858                 if (    ( aExp == 0x3FFE )
5859                      && (   extractFloat128Frac0( a )
5860                           | extractFloat128Frac1( a ) )
5861                    ) {
5862                     return packFloat128( aSign, 0x3FFF, 0, 0 );
5863                 }
5864                 break;
5865             case float_round_ties_away:
5866                 if (aExp == 0x3FFE) {
5867                     return packFloat128(aSign, 0x3FFF, 0, 0);
5868                 }
5869                 break;
5870              case float_round_down:
5871                 return
5872                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
5873                     : packFloat128( 0, 0, 0, 0 );
5874              case float_round_up:
5875                 return
5876                       aSign ? packFloat128( 1, 0, 0, 0 )
5877                     : packFloat128( 0, 0x3FFF, 0, 0 );
5878             }
5879             return packFloat128( aSign, 0, 0, 0 );
5880         }
5881         lastBitMask = 1;
5882         lastBitMask <<= 0x402F - aExp;
5883         roundBitsMask = lastBitMask - 1;
5884         z.low = 0;
5885         z.high = a.high;
5886         switch (status->float_rounding_mode) {
5887         case float_round_nearest_even:
5888             z.high += lastBitMask>>1;
5889             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
5890                 z.high &= ~ lastBitMask;
5891             }
5892             break;
5893         case float_round_ties_away:
5894             z.high += lastBitMask>>1;
5895             break;
5896         case float_round_to_zero:
5897             break;
5898         case float_round_up:
5899             if (!extractFloat128Sign(z)) {
5900                 z.high |= ( a.low != 0 );
5901                 z.high += roundBitsMask;
5902             }
5903             break;
5904         case float_round_down:
5905             if (extractFloat128Sign(z)) {
5906                 z.high |= (a.low != 0);
5907                 z.high += roundBitsMask;
5908             }
5909             break;
5910         default:
5911             abort();
5912         }
5913         z.high &= ~ roundBitsMask;
5914     }
5915     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
5916         status->float_exception_flags |= float_flag_inexact;
5917     }
5918     return z;
5919 
5920 }
5921 
5922 /*----------------------------------------------------------------------------
5923 | Returns the result of adding the absolute values of the quadruple-precision
5924 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
5925 | before being returned.  `zSign' is ignored if the result is a NaN.
5926 | The addition is performed according to the IEC/IEEE Standard for Binary
5927 | Floating-Point Arithmetic.
5928 *----------------------------------------------------------------------------*/
5929 
5930 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
5931                                 float_status *status)
5932 {
5933     int32_t aExp, bExp, zExp;
5934     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5935     int32_t expDiff;
5936 
5937     aSig1 = extractFloat128Frac1( a );
5938     aSig0 = extractFloat128Frac0( a );
5939     aExp = extractFloat128Exp( a );
5940     bSig1 = extractFloat128Frac1( b );
5941     bSig0 = extractFloat128Frac0( b );
5942     bExp = extractFloat128Exp( b );
5943     expDiff = aExp - bExp;
5944     if ( 0 < expDiff ) {
5945         if ( aExp == 0x7FFF ) {
5946             if (aSig0 | aSig1) {
5947                 return propagateFloat128NaN(a, b, status);
5948             }
5949             return a;
5950         }
5951         if ( bExp == 0 ) {
5952             --expDiff;
5953         }
5954         else {
5955             bSig0 |= LIT64( 0x0001000000000000 );
5956         }
5957         shift128ExtraRightJamming(
5958             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
5959         zExp = aExp;
5960     }
5961     else if ( expDiff < 0 ) {
5962         if ( bExp == 0x7FFF ) {
5963             if (bSig0 | bSig1) {
5964                 return propagateFloat128NaN(a, b, status);
5965             }
5966             return packFloat128( zSign, 0x7FFF, 0, 0 );
5967         }
5968         if ( aExp == 0 ) {
5969             ++expDiff;
5970         }
5971         else {
5972             aSig0 |= LIT64( 0x0001000000000000 );
5973         }
5974         shift128ExtraRightJamming(
5975             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
5976         zExp = bExp;
5977     }
5978     else {
5979         if ( aExp == 0x7FFF ) {
5980             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5981                 return propagateFloat128NaN(a, b, status);
5982             }
5983             return a;
5984         }
5985         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5986         if ( aExp == 0 ) {
5987             if (status->flush_to_zero) {
5988                 if (zSig0 | zSig1) {
5989                     float_raise(float_flag_output_denormal, status);
5990                 }
5991                 return packFloat128(zSign, 0, 0, 0);
5992             }
5993             return packFloat128( zSign, 0, zSig0, zSig1 );
5994         }
5995         zSig2 = 0;
5996         zSig0 |= LIT64( 0x0002000000000000 );
5997         zExp = aExp;
5998         goto shiftRight1;
5999     }
6000     aSig0 |= LIT64( 0x0001000000000000 );
6001     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6002     --zExp;
6003     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
6004     ++zExp;
6005  shiftRight1:
6006     shift128ExtraRightJamming(
6007         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6008  roundAndPack:
6009     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6010 
6011 }
6012 
6013 /*----------------------------------------------------------------------------
6014 | Returns the result of subtracting the absolute values of the quadruple-
6015 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
6016 | difference is negated before being returned.  `zSign' is ignored if the
6017 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
6018 | Standard for Binary Floating-Point Arithmetic.
6019 *----------------------------------------------------------------------------*/
6020 
6021 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
6022                                 float_status *status)
6023 {
6024     int32_t aExp, bExp, zExp;
6025     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
6026     int32_t expDiff;
6027 
6028     aSig1 = extractFloat128Frac1( a );
6029     aSig0 = extractFloat128Frac0( a );
6030     aExp = extractFloat128Exp( a );
6031     bSig1 = extractFloat128Frac1( b );
6032     bSig0 = extractFloat128Frac0( b );
6033     bExp = extractFloat128Exp( b );
6034     expDiff = aExp - bExp;
6035     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6036     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6037     if ( 0 < expDiff ) goto aExpBigger;
6038     if ( expDiff < 0 ) goto bExpBigger;
6039     if ( aExp == 0x7FFF ) {
6040         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6041             return propagateFloat128NaN(a, b, status);
6042         }
6043         float_raise(float_flag_invalid, status);
6044         return float128_default_nan(status);
6045     }
6046     if ( aExp == 0 ) {
6047         aExp = 1;
6048         bExp = 1;
6049     }
6050     if ( bSig0 < aSig0 ) goto aBigger;
6051     if ( aSig0 < bSig0 ) goto bBigger;
6052     if ( bSig1 < aSig1 ) goto aBigger;
6053     if ( aSig1 < bSig1 ) goto bBigger;
6054     return packFloat128(status->float_rounding_mode == float_round_down,
6055                         0, 0, 0);
6056  bExpBigger:
6057     if ( bExp == 0x7FFF ) {
6058         if (bSig0 | bSig1) {
6059             return propagateFloat128NaN(a, b, status);
6060         }
6061         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6062     }
6063     if ( aExp == 0 ) {
6064         ++expDiff;
6065     }
6066     else {
6067         aSig0 |= LIT64( 0x4000000000000000 );
6068     }
6069     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6070     bSig0 |= LIT64( 0x4000000000000000 );
6071  bBigger:
6072     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6073     zExp = bExp;
6074     zSign ^= 1;
6075     goto normalizeRoundAndPack;
6076  aExpBigger:
6077     if ( aExp == 0x7FFF ) {
6078         if (aSig0 | aSig1) {
6079             return propagateFloat128NaN(a, b, status);
6080         }
6081         return a;
6082     }
6083     if ( bExp == 0 ) {
6084         --expDiff;
6085     }
6086     else {
6087         bSig0 |= LIT64( 0x4000000000000000 );
6088     }
6089     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6090     aSig0 |= LIT64( 0x4000000000000000 );
6091  aBigger:
6092     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6093     zExp = aExp;
6094  normalizeRoundAndPack:
6095     --zExp;
6096     return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
6097                                          status);
6098 
6099 }
6100 
6101 /*----------------------------------------------------------------------------
6102 | Returns the result of adding the quadruple-precision floating-point values
6103 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
6104 | for Binary Floating-Point Arithmetic.
6105 *----------------------------------------------------------------------------*/
6106 
6107 float128 float128_add(float128 a, float128 b, float_status *status)
6108 {
6109     flag aSign, bSign;
6110 
6111     aSign = extractFloat128Sign( a );
6112     bSign = extractFloat128Sign( b );
6113     if ( aSign == bSign ) {
6114         return addFloat128Sigs(a, b, aSign, status);
6115     }
6116     else {
6117         return subFloat128Sigs(a, b, aSign, status);
6118     }
6119 
6120 }
6121 
6122 /*----------------------------------------------------------------------------
6123 | Returns the result of subtracting the quadruple-precision floating-point
6124 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6125 | Standard for Binary Floating-Point Arithmetic.
6126 *----------------------------------------------------------------------------*/
6127 
6128 float128 float128_sub(float128 a, float128 b, float_status *status)
6129 {
6130     flag aSign, bSign;
6131 
6132     aSign = extractFloat128Sign( a );
6133     bSign = extractFloat128Sign( b );
6134     if ( aSign == bSign ) {
6135         return subFloat128Sigs(a, b, aSign, status);
6136     }
6137     else {
6138         return addFloat128Sigs(a, b, aSign, status);
6139     }
6140 
6141 }
6142 
6143 /*----------------------------------------------------------------------------
6144 | Returns the result of multiplying the quadruple-precision floating-point
6145 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6146 | Standard for Binary Floating-Point Arithmetic.
6147 *----------------------------------------------------------------------------*/
6148 
6149 float128 float128_mul(float128 a, float128 b, float_status *status)
6150 {
6151     flag aSign, bSign, zSign;
6152     int32_t aExp, bExp, zExp;
6153     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
6154 
6155     aSig1 = extractFloat128Frac1( a );
6156     aSig0 = extractFloat128Frac0( a );
6157     aExp = extractFloat128Exp( a );
6158     aSign = extractFloat128Sign( a );
6159     bSig1 = extractFloat128Frac1( b );
6160     bSig0 = extractFloat128Frac0( b );
6161     bExp = extractFloat128Exp( b );
6162     bSign = extractFloat128Sign( b );
6163     zSign = aSign ^ bSign;
6164     if ( aExp == 0x7FFF ) {
6165         if (    ( aSig0 | aSig1 )
6166              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6167             return propagateFloat128NaN(a, b, status);
6168         }
6169         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6170         return packFloat128( zSign, 0x7FFF, 0, 0 );
6171     }
6172     if ( bExp == 0x7FFF ) {
6173         if (bSig0 | bSig1) {
6174             return propagateFloat128NaN(a, b, status);
6175         }
6176         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6177  invalid:
6178             float_raise(float_flag_invalid, status);
6179             return float128_default_nan(status);
6180         }
6181         return packFloat128( zSign, 0x7FFF, 0, 0 );
6182     }
6183     if ( aExp == 0 ) {
6184         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6185         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6186     }
6187     if ( bExp == 0 ) {
6188         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6189         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6190     }
6191     zExp = aExp + bExp - 0x4000;
6192     aSig0 |= LIT64( 0x0001000000000000 );
6193     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6194     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6195     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6196     zSig2 |= ( zSig3 != 0 );
6197     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6198         shift128ExtraRightJamming(
6199             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6200         ++zExp;
6201     }
6202     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6203 
6204 }
6205 
6206 /*----------------------------------------------------------------------------
6207 | Returns the result of dividing the quadruple-precision floating-point value
6208 | `a' by the corresponding value `b'.  The operation is performed according to
6209 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6210 *----------------------------------------------------------------------------*/
6211 
6212 float128 float128_div(float128 a, float128 b, float_status *status)
6213 {
6214     flag aSign, bSign, zSign;
6215     int32_t aExp, bExp, zExp;
6216     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6217     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6218 
6219     aSig1 = extractFloat128Frac1( a );
6220     aSig0 = extractFloat128Frac0( a );
6221     aExp = extractFloat128Exp( a );
6222     aSign = extractFloat128Sign( a );
6223     bSig1 = extractFloat128Frac1( b );
6224     bSig0 = extractFloat128Frac0( b );
6225     bExp = extractFloat128Exp( b );
6226     bSign = extractFloat128Sign( b );
6227     zSign = aSign ^ bSign;
6228     if ( aExp == 0x7FFF ) {
6229         if (aSig0 | aSig1) {
6230             return propagateFloat128NaN(a, b, status);
6231         }
6232         if ( bExp == 0x7FFF ) {
6233             if (bSig0 | bSig1) {
6234                 return propagateFloat128NaN(a, b, status);
6235             }
6236             goto invalid;
6237         }
6238         return packFloat128( zSign, 0x7FFF, 0, 0 );
6239     }
6240     if ( bExp == 0x7FFF ) {
6241         if (bSig0 | bSig1) {
6242             return propagateFloat128NaN(a, b, status);
6243         }
6244         return packFloat128( zSign, 0, 0, 0 );
6245     }
6246     if ( bExp == 0 ) {
6247         if ( ( bSig0 | bSig1 ) == 0 ) {
6248             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6249  invalid:
6250                 float_raise(float_flag_invalid, status);
6251                 return float128_default_nan(status);
6252             }
6253             float_raise(float_flag_divbyzero, status);
6254             return packFloat128( zSign, 0x7FFF, 0, 0 );
6255         }
6256         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6257     }
6258     if ( aExp == 0 ) {
6259         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6260         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6261     }
6262     zExp = aExp - bExp + 0x3FFD;
6263     shortShift128Left(
6264         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6265     shortShift128Left(
6266         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6267     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6268         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6269         ++zExp;
6270     }
6271     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6272     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6273     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
6274     while ( (int64_t) rem0 < 0 ) {
6275         --zSig0;
6276         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6277     }
6278     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6279     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6280         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6281         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
6282         while ( (int64_t) rem1 < 0 ) {
6283             --zSig1;
6284             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6285         }
6286         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6287     }
6288     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
6289     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6290 
6291 }
6292 
6293 /*----------------------------------------------------------------------------
6294 | Returns the remainder of the quadruple-precision floating-point value `a'
6295 | with respect to the corresponding value `b'.  The operation is performed
6296 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6297 *----------------------------------------------------------------------------*/
6298 
6299 float128 float128_rem(float128 a, float128 b, float_status *status)
6300 {
6301     flag aSign, zSign;
6302     int32_t aExp, bExp, expDiff;
6303     uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6304     uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6305     int64_t sigMean0;
6306 
6307     aSig1 = extractFloat128Frac1( a );
6308     aSig0 = extractFloat128Frac0( a );
6309     aExp = extractFloat128Exp( a );
6310     aSign = extractFloat128Sign( a );
6311     bSig1 = extractFloat128Frac1( b );
6312     bSig0 = extractFloat128Frac0( b );
6313     bExp = extractFloat128Exp( b );
6314     if ( aExp == 0x7FFF ) {
6315         if (    ( aSig0 | aSig1 )
6316              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6317             return propagateFloat128NaN(a, b, status);
6318         }
6319         goto invalid;
6320     }
6321     if ( bExp == 0x7FFF ) {
6322         if (bSig0 | bSig1) {
6323             return propagateFloat128NaN(a, b, status);
6324         }
6325         return a;
6326     }
6327     if ( bExp == 0 ) {
6328         if ( ( bSig0 | bSig1 ) == 0 ) {
6329  invalid:
6330             float_raise(float_flag_invalid, status);
6331             return float128_default_nan(status);
6332         }
6333         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6334     }
6335     if ( aExp == 0 ) {
6336         if ( ( aSig0 | aSig1 ) == 0 ) return a;
6337         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6338     }
6339     expDiff = aExp - bExp;
6340     if ( expDiff < -1 ) return a;
6341     shortShift128Left(
6342         aSig0 | LIT64( 0x0001000000000000 ),
6343         aSig1,
6344         15 - ( expDiff < 0 ),
6345         &aSig0,
6346         &aSig1
6347     );
6348     shortShift128Left(
6349         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6350     q = le128( bSig0, bSig1, aSig0, aSig1 );
6351     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6352     expDiff -= 64;
6353     while ( 0 < expDiff ) {
6354         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6355         q = ( 4 < q ) ? q - 4 : 0;
6356         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6357         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6358         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6359         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6360         expDiff -= 61;
6361     }
6362     if ( -64 < expDiff ) {
6363         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6364         q = ( 4 < q ) ? q - 4 : 0;
6365         q >>= - expDiff;
6366         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6367         expDiff += 52;
6368         if ( expDiff < 0 ) {
6369             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6370         }
6371         else {
6372             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6373         }
6374         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6375         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6376     }
6377     else {
6378         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6379         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6380     }
6381     do {
6382         alternateASig0 = aSig0;
6383         alternateASig1 = aSig1;
6384         ++q;
6385         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6386     } while ( 0 <= (int64_t) aSig0 );
6387     add128(
6388         aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
6389     if (    ( sigMean0 < 0 )
6390          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6391         aSig0 = alternateASig0;
6392         aSig1 = alternateASig1;
6393     }
6394     zSign = ( (int64_t) aSig0 < 0 );
6395     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6396     return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
6397                                          status);
6398 }
6399 
6400 /*----------------------------------------------------------------------------
6401 | Returns the square root of the quadruple-precision floating-point value `a'.
6402 | The operation is performed according to the IEC/IEEE Standard for Binary
6403 | Floating-Point Arithmetic.
6404 *----------------------------------------------------------------------------*/
6405 
6406 float128 float128_sqrt(float128 a, float_status *status)
6407 {
6408     flag aSign;
6409     int32_t aExp, zExp;
6410     uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6411     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6412 
6413     aSig1 = extractFloat128Frac1( a );
6414     aSig0 = extractFloat128Frac0( a );
6415     aExp = extractFloat128Exp( a );
6416     aSign = extractFloat128Sign( a );
6417     if ( aExp == 0x7FFF ) {
6418         if (aSig0 | aSig1) {
6419             return propagateFloat128NaN(a, a, status);
6420         }
6421         if ( ! aSign ) return a;
6422         goto invalid;
6423     }
6424     if ( aSign ) {
6425         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6426  invalid:
6427         float_raise(float_flag_invalid, status);
6428         return float128_default_nan(status);
6429     }
6430     if ( aExp == 0 ) {
6431         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6432         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6433     }
6434     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6435     aSig0 |= LIT64( 0x0001000000000000 );
6436     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6437     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6438     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6439     doubleZSig0 = zSig0<<1;
6440     mul64To128( zSig0, zSig0, &term0, &term1 );
6441     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6442     while ( (int64_t) rem0 < 0 ) {
6443         --zSig0;
6444         doubleZSig0 -= 2;
6445         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6446     }
6447     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6448     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6449         if ( zSig1 == 0 ) zSig1 = 1;
6450         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6451         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6452         mul64To128( zSig1, zSig1, &term2, &term3 );
6453         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6454         while ( (int64_t) rem1 < 0 ) {
6455             --zSig1;
6456             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6457             term3 |= 1;
6458             term2 |= doubleZSig0;
6459             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6460         }
6461         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6462     }
6463     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6464     return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
6465 
6466 }
6467 
6468 /*----------------------------------------------------------------------------
6469 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6470 | the corresponding value `b', and 0 otherwise.  The invalid exception is
6471 | raised if either operand is a NaN.  Otherwise, the comparison is performed
6472 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6473 *----------------------------------------------------------------------------*/
6474 
6475 int float128_eq(float128 a, float128 b, float_status *status)
6476 {
6477 
6478     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6479               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6480          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6481               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6482        ) {
6483         float_raise(float_flag_invalid, status);
6484         return 0;
6485     }
6486     return
6487            ( a.low == b.low )
6488         && (    ( a.high == b.high )
6489              || (    ( a.low == 0 )
6490                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6491            );
6492 
6493 }
6494 
6495 /*----------------------------------------------------------------------------
6496 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6497 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
6498 | exception is raised if either operand is a NaN.  The comparison is performed
6499 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6500 *----------------------------------------------------------------------------*/
6501 
6502 int float128_le(float128 a, float128 b, float_status *status)
6503 {
6504     flag aSign, bSign;
6505 
6506     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6507               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6508          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6509               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6510        ) {
6511         float_raise(float_flag_invalid, status);
6512         return 0;
6513     }
6514     aSign = extractFloat128Sign( a );
6515     bSign = extractFloat128Sign( b );
6516     if ( aSign != bSign ) {
6517         return
6518                aSign
6519             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6520                  == 0 );
6521     }
6522     return
6523           aSign ? le128( b.high, b.low, a.high, a.low )
6524         : le128( a.high, a.low, b.high, b.low );
6525 
6526 }
6527 
6528 /*----------------------------------------------------------------------------
6529 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6530 | the corresponding value `b', and 0 otherwise.  The invalid exception is
6531 | raised if either operand is a NaN.  The comparison is performed according
6532 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6533 *----------------------------------------------------------------------------*/
6534 
6535 int float128_lt(float128 a, float128 b, float_status *status)
6536 {
6537     flag aSign, bSign;
6538 
6539     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6540               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6541          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6542               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6543        ) {
6544         float_raise(float_flag_invalid, status);
6545         return 0;
6546     }
6547     aSign = extractFloat128Sign( a );
6548     bSign = extractFloat128Sign( b );
6549     if ( aSign != bSign ) {
6550         return
6551                aSign
6552             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6553                  != 0 );
6554     }
6555     return
6556           aSign ? lt128( b.high, b.low, a.high, a.low )
6557         : lt128( a.high, a.low, b.high, b.low );
6558 
6559 }
6560 
6561 /*----------------------------------------------------------------------------
6562 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6563 | be compared, and 0 otherwise.  The invalid exception is raised if either
6564 | operand is a NaN. The comparison is performed according to the IEC/IEEE
6565 | Standard for Binary Floating-Point Arithmetic.
6566 *----------------------------------------------------------------------------*/
6567 
6568 int float128_unordered(float128 a, float128 b, float_status *status)
6569 {
6570     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6571               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6572          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6573               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6574        ) {
6575         float_raise(float_flag_invalid, status);
6576         return 1;
6577     }
6578     return 0;
6579 }
6580 
6581 /*----------------------------------------------------------------------------
6582 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6583 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
6584 | exception.  The comparison is performed according to the IEC/IEEE Standard
6585 | for Binary Floating-Point Arithmetic.
6586 *----------------------------------------------------------------------------*/
6587 
6588 int float128_eq_quiet(float128 a, float128 b, float_status *status)
6589 {
6590 
6591     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6592               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6593          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6594               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6595        ) {
6596         if (float128_is_signaling_nan(a, status)
6597          || float128_is_signaling_nan(b, status)) {
6598             float_raise(float_flag_invalid, status);
6599         }
6600         return 0;
6601     }
6602     return
6603            ( a.low == b.low )
6604         && (    ( a.high == b.high )
6605              || (    ( a.low == 0 )
6606                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6607            );
6608 
6609 }
6610 
6611 /*----------------------------------------------------------------------------
6612 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6613 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
6614 | cause an exception.  Otherwise, the comparison is performed according to the
6615 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6616 *----------------------------------------------------------------------------*/
6617 
6618 int float128_le_quiet(float128 a, float128 b, float_status *status)
6619 {
6620     flag aSign, bSign;
6621 
6622     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6623               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6624          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6625               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6626        ) {
6627         if (float128_is_signaling_nan(a, status)
6628          || float128_is_signaling_nan(b, status)) {
6629             float_raise(float_flag_invalid, status);
6630         }
6631         return 0;
6632     }
6633     aSign = extractFloat128Sign( a );
6634     bSign = extractFloat128Sign( b );
6635     if ( aSign != bSign ) {
6636         return
6637                aSign
6638             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6639                  == 0 );
6640     }
6641     return
6642           aSign ? le128( b.high, b.low, a.high, a.low )
6643         : le128( a.high, a.low, b.high, b.low );
6644 
6645 }
6646 
6647 /*----------------------------------------------------------------------------
6648 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6649 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
6650 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
6651 | Standard for Binary Floating-Point Arithmetic.
6652 *----------------------------------------------------------------------------*/
6653 
6654 int float128_lt_quiet(float128 a, float128 b, float_status *status)
6655 {
6656     flag aSign, bSign;
6657 
6658     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6659               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6660          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6661               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6662        ) {
6663         if (float128_is_signaling_nan(a, status)
6664          || float128_is_signaling_nan(b, status)) {
6665             float_raise(float_flag_invalid, status);
6666         }
6667         return 0;
6668     }
6669     aSign = extractFloat128Sign( a );
6670     bSign = extractFloat128Sign( b );
6671     if ( aSign != bSign ) {
6672         return
6673                aSign
6674             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6675                  != 0 );
6676     }
6677     return
6678           aSign ? lt128( b.high, b.low, a.high, a.low )
6679         : lt128( a.high, a.low, b.high, b.low );
6680 
6681 }
6682 
6683 /*----------------------------------------------------------------------------
6684 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6685 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
6686 | comparison is performed according to the IEC/IEEE Standard for Binary
6687 | Floating-Point Arithmetic.
6688 *----------------------------------------------------------------------------*/
6689 
6690 int float128_unordered_quiet(float128 a, float128 b, float_status *status)
6691 {
6692     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6693               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6694          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6695               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6696        ) {
6697         if (float128_is_signaling_nan(a, status)
6698          || float128_is_signaling_nan(b, status)) {
6699             float_raise(float_flag_invalid, status);
6700         }
6701         return 1;
6702     }
6703     return 0;
6704 }
6705 
6706 static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
6707                                             int is_quiet, float_status *status)
6708 {
6709     flag aSign, bSign;
6710 
6711     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6712         float_raise(float_flag_invalid, status);
6713         return float_relation_unordered;
6714     }
6715     if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
6716           ( extractFloatx80Frac( a )<<1 ) ) ||
6717         ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
6718           ( extractFloatx80Frac( b )<<1 ) )) {
6719         if (!is_quiet ||
6720             floatx80_is_signaling_nan(a, status) ||
6721             floatx80_is_signaling_nan(b, status)) {
6722             float_raise(float_flag_invalid, status);
6723         }
6724         return float_relation_unordered;
6725     }
6726     aSign = extractFloatx80Sign( a );
6727     bSign = extractFloatx80Sign( b );
6728     if ( aSign != bSign ) {
6729 
6730         if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
6731              ( ( a.low | b.low ) == 0 ) ) {
6732             /* zero case */
6733             return float_relation_equal;
6734         } else {
6735             return 1 - (2 * aSign);
6736         }
6737     } else {
6738         if (a.low == b.low && a.high == b.high) {
6739             return float_relation_equal;
6740         } else {
6741             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6742         }
6743     }
6744 }
6745 
6746 int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
6747 {
6748     return floatx80_compare_internal(a, b, 0, status);
6749 }
6750 
6751 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
6752 {
6753     return floatx80_compare_internal(a, b, 1, status);
6754 }
6755 
6756 static inline int float128_compare_internal(float128 a, float128 b,
6757                                             int is_quiet, float_status *status)
6758 {
6759     flag aSign, bSign;
6760 
6761     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
6762           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
6763         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
6764           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
6765         if (!is_quiet ||
6766             float128_is_signaling_nan(a, status) ||
6767             float128_is_signaling_nan(b, status)) {
6768             float_raise(float_flag_invalid, status);
6769         }
6770         return float_relation_unordered;
6771     }
6772     aSign = extractFloat128Sign( a );
6773     bSign = extractFloat128Sign( b );
6774     if ( aSign != bSign ) {
6775         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
6776             /* zero case */
6777             return float_relation_equal;
6778         } else {
6779             return 1 - (2 * aSign);
6780         }
6781     } else {
6782         if (a.low == b.low && a.high == b.high) {
6783             return float_relation_equal;
6784         } else {
6785             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6786         }
6787     }
6788 }
6789 
6790 int float128_compare(float128 a, float128 b, float_status *status)
6791 {
6792     return float128_compare_internal(a, b, 0, status);
6793 }
6794 
6795 int float128_compare_quiet(float128 a, float128 b, float_status *status)
6796 {
6797     return float128_compare_internal(a, b, 1, status);
6798 }
6799 
6800 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
6801 {
6802     flag aSign;
6803     int32_t aExp;
6804     uint64_t aSig;
6805 
6806     if (floatx80_invalid_encoding(a)) {
6807         float_raise(float_flag_invalid, status);
6808         return floatx80_default_nan(status);
6809     }
6810     aSig = extractFloatx80Frac( a );
6811     aExp = extractFloatx80Exp( a );
6812     aSign = extractFloatx80Sign( a );
6813 
6814     if ( aExp == 0x7FFF ) {
6815         if ( aSig<<1 ) {
6816             return propagateFloatx80NaN(a, a, status);
6817         }
6818         return a;
6819     }
6820 
6821     if (aExp == 0) {
6822         if (aSig == 0) {
6823             return a;
6824         }
6825         aExp++;
6826     }
6827 
6828     if (n > 0x10000) {
6829         n = 0x10000;
6830     } else if (n < -0x10000) {
6831         n = -0x10000;
6832     }
6833 
6834     aExp += n;
6835     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
6836                                          aSign, aExp, aSig, 0, status);
6837 }
6838 
6839 float128 float128_scalbn(float128 a, int n, float_status *status)
6840 {
6841     flag aSign;
6842     int32_t aExp;
6843     uint64_t aSig0, aSig1;
6844 
6845     aSig1 = extractFloat128Frac1( a );
6846     aSig0 = extractFloat128Frac0( a );
6847     aExp = extractFloat128Exp( a );
6848     aSign = extractFloat128Sign( a );
6849     if ( aExp == 0x7FFF ) {
6850         if ( aSig0 | aSig1 ) {
6851             return propagateFloat128NaN(a, a, status);
6852         }
6853         return a;
6854     }
6855     if (aExp != 0) {
6856         aSig0 |= LIT64( 0x0001000000000000 );
6857     } else if (aSig0 == 0 && aSig1 == 0) {
6858         return a;
6859     } else {
6860         aExp++;
6861     }
6862 
6863     if (n > 0x10000) {
6864         n = 0x10000;
6865     } else if (n < -0x10000) {
6866         n = -0x10000;
6867     }
6868 
6869     aExp += n - 1;
6870     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
6871                                          , status);
6872 
6873 }
6874