xref: /openbmc/qemu/fpu/softfloat.c (revision e6d34aee)
1 /*
2  * QEMU float support
3  *
4  * The code in this source file is derived from release 2a of the SoftFloat
5  * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6  * some later contributions) are provided under that license, as detailed below.
7  * It has subsequently been modified by contributors to the QEMU Project,
8  * so some portions are provided under:
9  *  the SoftFloat-2a license
10  *  the BSD license
11  *  GPL-v2-or-later
12  *
13  * Any future contributions to this file after December 1st 2014 will be
14  * taken to be licensed under the Softfloat-2a license unless specifically
15  * indicated otherwise.
16  */
17 
18 /*
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
22 
23 Written by John R. Hauser.  This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704.  Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980.  The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
32 
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38 
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
43 
44 ===============================================================================
45 */
46 
47 /* BSD licensing:
48  * Copyright (c) 2006, Fabrice Bellard
49  * All rights reserved.
50  *
51  * Redistribution and use in source and binary forms, with or without
52  * modification, are permitted provided that the following conditions are met:
53  *
54  * 1. Redistributions of source code must retain the above copyright notice,
55  * this list of conditions and the following disclaimer.
56  *
57  * 2. Redistributions in binary form must reproduce the above copyright notice,
58  * this list of conditions and the following disclaimer in the documentation
59  * and/or other materials provided with the distribution.
60  *
61  * 3. Neither the name of the copyright holder nor the names of its contributors
62  * may be used to endorse or promote products derived from this software without
63  * specific prior written permission.
64  *
65  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75  * THE POSSIBILITY OF SUCH DAMAGE.
76  */
77 
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79  * version 2 or later. See the COPYING file in the top-level directory.
80  */
81 
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83  * target-dependent and needs the TARGET_* macros.
84  */
85 #include "qemu/osdep.h"
86 #include "qemu/bitops.h"
87 #include "fpu/softfloat.h"
88 
89 /* We only need stdlib for abort() */
90 
91 /*----------------------------------------------------------------------------
92 | Primitive arithmetic functions, including multi-word arithmetic, and
93 | division and square root approximations.  (Can be specialized to target if
94 | desired.)
95 *----------------------------------------------------------------------------*/
96 #include "fpu/softfloat-macros.h"
97 
98 /*----------------------------------------------------------------------------
99 | Returns the fraction bits of the half-precision floating-point value `a'.
100 *----------------------------------------------------------------------------*/
101 
102 static inline uint32_t extractFloat16Frac(float16 a)
103 {
104     return float16_val(a) & 0x3ff;
105 }
106 
107 /*----------------------------------------------------------------------------
108 | Returns the exponent bits of the half-precision floating-point value `a'.
109 *----------------------------------------------------------------------------*/
110 
111 static inline int extractFloat16Exp(float16 a)
112 {
113     return (float16_val(a) >> 10) & 0x1f;
114 }
115 
116 /*----------------------------------------------------------------------------
117 | Returns the fraction bits of the single-precision floating-point value `a'.
118 *----------------------------------------------------------------------------*/
119 
120 static inline uint32_t extractFloat32Frac(float32 a)
121 {
122     return float32_val(a) & 0x007FFFFF;
123 }
124 
125 /*----------------------------------------------------------------------------
126 | Returns the exponent bits of the single-precision floating-point value `a'.
127 *----------------------------------------------------------------------------*/
128 
129 static inline int extractFloat32Exp(float32 a)
130 {
131     return (float32_val(a) >> 23) & 0xFF;
132 }
133 
134 /*----------------------------------------------------------------------------
135 | Returns the sign bit of the single-precision floating-point value `a'.
136 *----------------------------------------------------------------------------*/
137 
138 static inline flag extractFloat32Sign(float32 a)
139 {
140     return float32_val(a) >> 31;
141 }
142 
143 /*----------------------------------------------------------------------------
144 | Returns the fraction bits of the double-precision floating-point value `a'.
145 *----------------------------------------------------------------------------*/
146 
147 static inline uint64_t extractFloat64Frac(float64 a)
148 {
149     return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF);
150 }
151 
152 /*----------------------------------------------------------------------------
153 | Returns the exponent bits of the double-precision floating-point value `a'.
154 *----------------------------------------------------------------------------*/
155 
156 static inline int extractFloat64Exp(float64 a)
157 {
158     return (float64_val(a) >> 52) & 0x7FF;
159 }
160 
161 /*----------------------------------------------------------------------------
162 | Returns the sign bit of the double-precision floating-point value `a'.
163 *----------------------------------------------------------------------------*/
164 
165 static inline flag extractFloat64Sign(float64 a)
166 {
167     return float64_val(a) >> 63;
168 }
169 
/*
 * Classify a floating point number. The enumerators are ordered so
 * that float_class_qnan and everything after it is a NaN; therefore
 * cls >= float_class_qnan tests for any NaN.
 *
 * The enum is packed so that it occupies as little space as possible
 * inside FloatParts.
 */

typedef enum __attribute__ ((__packed__)) {
    float_class_unclassified,  /* raw fields, not yet canonicalized */
    float_class_zero,
    float_class_normal,
    float_class_inf,
    float_class_qnan,  /* all NaNs from here */
    float_class_snan,
} FloatClass;
183 
184 /* Simple helpers for checking if, or what kind of, NaN we have */
185 static inline __attribute__((unused)) bool is_nan(FloatClass c)
186 {
187     return unlikely(c >= float_class_qnan);
188 }
189 
190 static inline __attribute__((unused)) bool is_snan(FloatClass c)
191 {
192     return c == float_class_snan;
193 }
194 
195 static inline __attribute__((unused)) bool is_qnan(FloatClass c)
196 {
197     return c == float_class_qnan;
198 }
199 
/*
 * Structure holding all of the decomposed parts of a float. Once
 * canonicalized, the exponent is unbiased and the fraction is
 * normalized. All calculations are done with a 64 bit fraction and
 * then rounded as appropriate for the final format.
 *
 * Thanks to the packed FloatClass a decent compiler should be able to
 * fit the whole structure into registers and avoid using the stack
 * for parameter passing.
 */

typedef struct {
    uint64_t frac;   /* fraction bits; raw field or normalized value */
    int32_t  exp;    /* exponent; raw (biased) field or unbiased value */
    FloatClass cls;  /* classification; unclassified until canonicalized */
    bool sign;       /* sign bit: true for negative */
} FloatParts;
217 
/*
 * Layout of a normalized FloatParts fraction: the binary point sits at
 * bit 62, so the implicit (integer) bit of a normal number is bit 62 and
 * bit 63 is left free to catch a carry out of an addition or rounding.
 */
#define DECOMPOSED_BINARY_POINT    (64 - 2)
#define DECOMPOSED_IMPLICIT_BIT    (1ull << DECOMPOSED_BINARY_POINT)
#define DECOMPOSED_OVERFLOW_BIT    (DECOMPOSED_IMPLICIT_BIT << 1)
221 
/* Structure holding all of the relevant parameters for a format.
 *   exp_size: the size of the exponent field, in bits
 *   exp_bias: the offset applied to the exponent field
 *   exp_max: the maximum value of the (biased) exponent field
 *   frac_size: the size of the fraction field, in bits
 *   frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
 * The following are computed based on the size of the fraction:
 *   frac_lsb: least significant bit of fraction
 *   frac_lsbm1: the bit below the least significant bit (for rounding)
 *   round_mask/roundeven_mask: masks used for rounding
 * The following optional modifiers are available:
 *   arm_althp: handle ARM Alternative Half Precision
 */
typedef struct {
    int exp_size;
    int exp_bias;
    int exp_max;
    int frac_size;
    int frac_shift;
    uint64_t frac_lsb;
    uint64_t frac_lsbm1;
    uint64_t round_mask;      /* all bits below frac_lsb */
    uint64_t roundeven_mask;  /* frac_lsb plus all bits below it */
    bool arm_althp;
} FloatFmt;
247 
/*
 * Expand the FloatFmt fields derived from the size of the exponent (E)
 * and of the fraction (F), for use inside a designated initializer.
 *
 * Both arguments are parenthesized at every use so that an expression
 * argument (e.g. FLOAT_PARAMS(8, 20 + 3)) expands with the intended
 * precedence.  Each argument is evaluated several times, so it must be
 * free of side effects.
 */
#define FLOAT_PARAMS(E, F)                                           \
    .exp_size       = (E),                                           \
    .exp_bias       = ((1 << (E)) - 1) >> 1,                         \
    .exp_max        = (1 << (E)) - 1,                                \
    .frac_size      = (F),                                           \
    .frac_shift     = DECOMPOSED_BINARY_POINT - (F),                 \
    .frac_lsb       = 1ull << (DECOMPOSED_BINARY_POINT - (F)),       \
    .frac_lsbm1     = 1ull << ((DECOMPOSED_BINARY_POINT - (F)) - 1), \
    .round_mask     = (1ull << (DECOMPOSED_BINARY_POINT - (F))) - 1, \
    .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - (F))) - 1
259 
/* Half precision: 5 exponent bits, 10 fraction bits. */
static const FloatFmt float16_params = {
    FLOAT_PARAMS(5, 10)
};

/*
 * Same field sizes as float16_params, with the ARM Alternative Half
 * Precision modifier set (no Inf/NaN encodings; see canonicalize() and
 * round_canonical()).
 */
static const FloatFmt float16_params_ahp = {
    FLOAT_PARAMS(5, 10),
    .arm_althp = true
};

/* Single precision: 8 exponent bits, 23 fraction bits. */
static const FloatFmt float32_params = {
    FLOAT_PARAMS(8, 23)
};

/* Double precision: 11 exponent bits, 52 fraction bits. */
static const FloatFmt float64_params = {
    FLOAT_PARAMS(11, 52)
};
276 
277 /* Unpack a float to parts, but do not canonicalize.  */
278 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw)
279 {
280     const int sign_pos = fmt.frac_size + fmt.exp_size;
281 
282     return (FloatParts) {
283         .cls = float_class_unclassified,
284         .sign = extract64(raw, sign_pos, 1),
285         .exp = extract64(raw, fmt.frac_size, fmt.exp_size),
286         .frac = extract64(raw, 0, fmt.frac_size),
287     };
288 }
289 
290 static inline FloatParts float16_unpack_raw(float16 f)
291 {
292     return unpack_raw(float16_params, f);
293 }
294 
295 static inline FloatParts float32_unpack_raw(float32 f)
296 {
297     return unpack_raw(float32_params, f);
298 }
299 
300 static inline FloatParts float64_unpack_raw(float64 f)
301 {
302     return unpack_raw(float64_params, f);
303 }
304 
305 /* Pack a float from parts, but do not canonicalize.  */
306 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p)
307 {
308     const int sign_pos = fmt.frac_size + fmt.exp_size;
309     uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp);
310     return deposit64(ret, sign_pos, 1, p.sign);
311 }
312 
313 static inline float16 float16_pack_raw(FloatParts p)
314 {
315     return make_float16(pack_raw(float16_params, p));
316 }
317 
318 static inline float32 float32_pack_raw(FloatParts p)
319 {
320     return make_float32(pack_raw(float32_params, p));
321 }
322 
323 static inline float64 float64_pack_raw(FloatParts p)
324 {
325     return make_float64(pack_raw(float64_params, p));
326 }
327 
328 /*----------------------------------------------------------------------------
329 | Functions and definitions to determine:  (1) whether tininess for underflow
330 | is detected before or after rounding by default, (2) what (if anything)
331 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
332 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
333 | are propagated from function inputs to output.  These details are target-
334 | specific.
335 *----------------------------------------------------------------------------*/
336 #include "softfloat-specialize.h"
337 
338 /* Canonicalize EXP and FRAC, setting CLS.  */
339 static FloatParts canonicalize(FloatParts part, const FloatFmt *parm,
340                                float_status *status)
341 {
342     if (part.exp == parm->exp_max && !parm->arm_althp) {
343         if (part.frac == 0) {
344             part.cls = float_class_inf;
345         } else {
346             part.frac <<= parm->frac_shift;
347             part.cls = (parts_is_snan_frac(part.frac, status)
348                         ? float_class_snan : float_class_qnan);
349         }
350     } else if (part.exp == 0) {
351         if (likely(part.frac == 0)) {
352             part.cls = float_class_zero;
353         } else if (status->flush_inputs_to_zero) {
354             float_raise(float_flag_input_denormal, status);
355             part.cls = float_class_zero;
356             part.frac = 0;
357         } else {
358             int shift = clz64(part.frac) - 1;
359             part.cls = float_class_normal;
360             part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
361             part.frac <<= shift;
362         }
363     } else {
364         part.cls = float_class_normal;
365         part.exp -= parm->exp_bias;
366         part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
367     }
368     return part;
369 }
370 
/* Round and uncanonicalize a floating-point number by parts. There
 * are FRAC_SHIFT bits that may require rounding at the bottom of the
 * fraction; these bits will be removed. The exponent will be biased
 * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
 *
 * P is the canonical value, S supplies the rounding mode, the
 * flush-to-zero and tininess settings and receives the exception
 * flags, and PARM describes the destination format.  The returned
 * parts hold raw field values ready for pack_raw().
 */

static FloatParts round_canonical(FloatParts p, float_status *s,
                                  const FloatFmt *parm)
{
    const uint64_t frac_lsbm1 = parm->frac_lsbm1;
    const uint64_t round_mask = parm->round_mask;
    const uint64_t roundeven_mask = parm->roundeven_mask;
    const int exp_max = parm->exp_max;
    const int frac_shift = parm->frac_shift;
    uint64_t frac, inc;
    int exp, flags = 0;
    bool overflow_norm;

    frac = p.frac;
    exp = p.exp;

    switch (p.cls) {
    case float_class_normal:
        /*
         * Per rounding mode, pick the increment to add below the LSB
         * (inc) and whether an overflow saturates to the largest
         * finite value (overflow_norm) instead of becoming infinity.
         */
        switch (s->float_rounding_mode) {
        case float_round_nearest_even:
            overflow_norm = false;
            /* Add half an LSB, except on an exact tie with an even LSB. */
            inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
            break;
        case float_round_ties_away:
            overflow_norm = false;
            inc = frac_lsbm1;
            break;
        case float_round_to_zero:
            overflow_norm = true;
            inc = 0;
            break;
        case float_round_up:
            inc = p.sign ? 0 : round_mask;
            overflow_norm = p.sign;
            break;
        case float_round_down:
            inc = p.sign ? round_mask : 0;
            overflow_norm = !p.sign;
            break;
        default:
            g_assert_not_reached();
        }

        exp += parm->exp_bias;
        if (likely(exp > 0)) {
            /* Result is in the normal range (or overflows). */
            if (frac & round_mask) {
                flags |= float_flag_inexact;
                frac += inc;
                /* Rounding carried out of the implicit bit: renormalize. */
                if (frac & DECOMPOSED_OVERFLOW_BIT) {
                    frac >>= 1;
                    exp++;
                }
            }
            frac >>= frac_shift;

            if (parm->arm_althp) {
                /* ARM Alt HP eschews Inf and NaN for a wider exponent.  */
                if (unlikely(exp > exp_max)) {
                    /* Overflow.  Return the maximum normal.  */
                    /* Note: this replaces, rather than ORs into, flags. */
                    flags = float_flag_invalid;
                    exp = exp_max;
                    frac = -1;
                }
            } else if (unlikely(exp >= exp_max)) {
                flags |= float_flag_overflow | float_flag_inexact;
                if (overflow_norm) {
                    /* Saturate: largest exponent, all-ones fraction. */
                    exp = exp_max - 1;
                    frac = -1;
                } else {
                    p.cls = float_class_inf;
                    goto do_inf;
                }
            }
        } else if (s->flush_to_zero) {
            flags |= float_flag_output_denormal;
            p.cls = float_class_zero;
            goto do_zero;
        } else {
            /*
             * Denormal result.  It is tiny if tininess is detected
             * before rounding, if the biased exponent is negative, or
             * if adding the increment does not carry into the overflow
             * bit.
             */
            bool is_tiny = (s->float_detect_tininess
                            == float_tininess_before_rounding)
                        || (exp < 0)
                        || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT);

            /*
             * Shift the fraction down to the denormal position.
             * NOTE(review): presumably the shifted-out bits are kept as
             * a sticky bit -- confirm in softfloat-macros.h.
             */
            shift64RightJamming(frac, 1 - exp, &frac);
            if (frac & round_mask) {
                /* Need to recompute round-to-even.  */
                if (s->float_rounding_mode == float_round_nearest_even) {
                    inc = ((frac & roundeven_mask) != frac_lsbm1
                           ? frac_lsbm1 : 0);
                }
                flags |= float_flag_inexact;
                frac += inc;
            }

            /* Rounding may have carried up into the smallest normal. */
            exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
            frac >>= frac_shift;

            if (is_tiny && (flags & float_flag_inexact)) {
                flags |= float_flag_underflow;
            }
            if (exp == 0 && frac == 0) {
                p.cls = float_class_zero;
            }
        }
        break;

    case float_class_zero:
    do_zero:
        exp = 0;
        frac = 0;
        break;

    case float_class_inf:
    do_inf:
        /* Alt HP has no encoding for infinity. */
        assert(!parm->arm_althp);
        exp = exp_max;
        frac = 0;
        break;

    case float_class_qnan:
    case float_class_snan:
        /* Alt HP has no encoding for NaN. */
        assert(!parm->arm_althp);
        exp = exp_max;
        frac >>= parm->frac_shift;
        break;

    default:
        g_assert_not_reached();
    }

    float_raise(flags, s);
    p.exp = exp;
    p.frac = frac;
    return p;
}
511 
512 /* Explicit FloatFmt version */
513 static FloatParts float16a_unpack_canonical(float16 f, float_status *s,
514                                             const FloatFmt *params)
515 {
516     return canonicalize(float16_unpack_raw(f), params, s);
517 }
518 
519 static FloatParts float16_unpack_canonical(float16 f, float_status *s)
520 {
521     return float16a_unpack_canonical(f, s, &float16_params);
522 }
523 
524 static float16 float16a_round_pack_canonical(FloatParts p, float_status *s,
525                                              const FloatFmt *params)
526 {
527     return float16_pack_raw(round_canonical(p, s, params));
528 }
529 
530 static float16 float16_round_pack_canonical(FloatParts p, float_status *s)
531 {
532     return float16a_round_pack_canonical(p, s, &float16_params);
533 }
534 
535 static FloatParts float32_unpack_canonical(float32 f, float_status *s)
536 {
537     return canonicalize(float32_unpack_raw(f), &float32_params, s);
538 }
539 
540 static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
541 {
542     return float32_pack_raw(round_canonical(p, s, &float32_params));
543 }
544 
545 static FloatParts float64_unpack_canonical(float64 f, float_status *s)
546 {
547     return canonicalize(float64_unpack_raw(f), &float64_params, s);
548 }
549 
550 static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
551 {
552     return float64_pack_raw(round_canonical(p, s, &float64_params));
553 }
554 
555 static FloatParts return_nan(FloatParts a, float_status *s)
556 {
557     switch (a.cls) {
558     case float_class_snan:
559         s->float_exception_flags |= float_flag_invalid;
560         a = parts_silence_nan(a, s);
561         /* fall through */
562     case float_class_qnan:
563         if (s->default_nan_mode) {
564             return parts_default_nan(s);
565         }
566         break;
567 
568     default:
569         g_assert_not_reached();
570     }
571     return a;
572 }
573 
574 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s)
575 {
576     if (is_snan(a.cls) || is_snan(b.cls)) {
577         s->float_exception_flags |= float_flag_invalid;
578     }
579 
580     if (s->default_nan_mode) {
581         return parts_default_nan(s);
582     } else {
583         if (pickNaN(a.cls, b.cls,
584                     a.frac > b.frac ||
585                     (a.frac == b.frac && a.sign < b.sign))) {
586             a = b;
587         }
588         if (is_snan(a.cls)) {
589             return parts_silence_nan(a, s);
590         }
591     }
592     return a;
593 }
594 
595 static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c,
596                                   bool inf_zero, float_status *s)
597 {
598     int which;
599 
600     if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
601         s->float_exception_flags |= float_flag_invalid;
602     }
603 
604     which = pickNaNMulAdd(a.cls, b.cls, c.cls, inf_zero, s);
605 
606     if (s->default_nan_mode) {
607         /* Note that this check is after pickNaNMulAdd so that function
608          * has an opportunity to set the Invalid flag.
609          */
610         which = 3;
611     }
612 
613     switch (which) {
614     case 0:
615         break;
616     case 1:
617         a = b;
618         break;
619     case 2:
620         a = c;
621         break;
622     case 3:
623         return parts_default_nan(s);
624     default:
625         g_assert_not_reached();
626     }
627 
628     if (is_snan(a.cls)) {
629         return parts_silence_nan(a, s);
630     }
631     return a;
632 }
633 
/*
 * Returns the result of adding or subtracting the canonical
 * floating-point values `a' and `b' (`b' is subtracted when `subtract'
 * is true). The operation is performed according to the IEC/IEEE
 * Standard for Binary Floating-Point Arithmetic.
 *
 * The result is returned in canonical form and is not yet rounded.
 * Exceptions are raised on `s'.
 */

static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
                                float_status *s)
{
    bool a_sign = a.sign;
    /* Effective sign of b once the requested subtraction is folded in. */
    bool b_sign = b.sign ^ subtract;

    if (a_sign != b_sign) {
        /* Subtraction */

        if (a.cls == float_class_normal && b.cls == float_class_normal) {
            /* Subtract the smaller magnitude from the larger one. */
            if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
                shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
                a.frac = a.frac - b.frac;
            } else {
                shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
                a.frac = b.frac - a.frac;
                a.exp = b.exp;
                /* |b| was larger, so the result takes the opposite sign. */
                a_sign ^= 1;
            }

            if (a.frac == 0) {
                /* Exact cancellation: zero, negative only when rounding down. */
                a.cls = float_class_zero;
                a.sign = s->float_rounding_mode == float_round_down;
            } else {
                /* Renormalize: bring the leading one back to the binary point. */
                int shift = clz64(a.frac) - 1;
                a.frac = a.frac << shift;
                a.exp = a.exp - shift;
                a.sign = a_sign;
            }
            return a;
        }
        if (is_nan(a.cls) || is_nan(b.cls)) {
            return pick_nan(a, b, s);
        }
        if (a.cls == float_class_inf) {
            if (b.cls == float_class_inf) {
                /* Inf - Inf is invalid. */
                float_raise(float_flag_invalid, s);
                return parts_default_nan(s);
            }
            return a;
        }
        if (a.cls == float_class_zero && b.cls == float_class_zero) {
            /* Zeros of opposite effective sign: negative only when rounding down. */
            a.sign = s->float_rounding_mode == float_round_down;
            return a;
        }
        if (a.cls == float_class_zero || b.cls == float_class_inf) {
            /* Result is b with its effective sign (the opposite of a's). */
            b.sign = a_sign ^ 1;
            return b;
        }
        if (b.cls == float_class_zero) {
            return a;
        }
    } else {
        /* Addition */
        if (a.cls == float_class_normal && b.cls == float_class_normal) {
            /* Align the operand with the smaller exponent to the larger. */
            if (a.exp > b.exp) {
                shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
            } else if (a.exp < b.exp) {
                shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
                a.exp = b.exp;
            }
            a.frac += b.frac;
            /* Carry out of the implicit bit: shift back down and bump exp. */
            if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
                shift64RightJamming(a.frac, 1, &a.frac);
                a.exp += 1;
            }
            return a;
        }
        if (is_nan(a.cls) || is_nan(b.cls)) {
            return pick_nan(a, b, s);
        }
        if (a.cls == float_class_inf || b.cls == float_class_zero) {
            return a;
        }
        if (b.cls == float_class_inf || a.cls == float_class_zero) {
            b.sign = b_sign;
            return b;
        }
    }
    /* Every combination of classes is handled above. */
    g_assert_not_reached();
}
722 
723 /*
724  * Returns the result of adding or subtracting the floating-point
725  * values `a' and `b'. The operation is performed according to the
726  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
727  */
728 
729 float16 QEMU_FLATTEN float16_add(float16 a, float16 b, float_status *status)
730 {
731     FloatParts pa = float16_unpack_canonical(a, status);
732     FloatParts pb = float16_unpack_canonical(b, status);
733     FloatParts pr = addsub_floats(pa, pb, false, status);
734 
735     return float16_round_pack_canonical(pr, status);
736 }
737 
738 float32 QEMU_FLATTEN float32_add(float32 a, float32 b, float_status *status)
739 {
740     FloatParts pa = float32_unpack_canonical(a, status);
741     FloatParts pb = float32_unpack_canonical(b, status);
742     FloatParts pr = addsub_floats(pa, pb, false, status);
743 
744     return float32_round_pack_canonical(pr, status);
745 }
746 
747 float64 QEMU_FLATTEN float64_add(float64 a, float64 b, float_status *status)
748 {
749     FloatParts pa = float64_unpack_canonical(a, status);
750     FloatParts pb = float64_unpack_canonical(b, status);
751     FloatParts pr = addsub_floats(pa, pb, false, status);
752 
753     return float64_round_pack_canonical(pr, status);
754 }
755 
756 float16 QEMU_FLATTEN float16_sub(float16 a, float16 b, float_status *status)
757 {
758     FloatParts pa = float16_unpack_canonical(a, status);
759     FloatParts pb = float16_unpack_canonical(b, status);
760     FloatParts pr = addsub_floats(pa, pb, true, status);
761 
762     return float16_round_pack_canonical(pr, status);
763 }
764 
765 float32 QEMU_FLATTEN float32_sub(float32 a, float32 b, float_status *status)
766 {
767     FloatParts pa = float32_unpack_canonical(a, status);
768     FloatParts pb = float32_unpack_canonical(b, status);
769     FloatParts pr = addsub_floats(pa, pb, true, status);
770 
771     return float32_round_pack_canonical(pr, status);
772 }
773 
774 float64 QEMU_FLATTEN float64_sub(float64 a, float64 b, float_status *status)
775 {
776     FloatParts pa = float64_unpack_canonical(a, status);
777     FloatParts pb = float64_unpack_canonical(b, status);
778     FloatParts pr = addsub_floats(pa, pb, true, status);
779 
780     return float64_round_pack_canonical(pr, status);
781 }
782 
783 /*
784  * Returns the result of multiplying the floating-point values `a' and
785  * `b'. The operation is performed according to the IEC/IEEE Standard
786  * for Binary Floating-Point Arithmetic.
787  */
788 
789 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s)
790 {
791     bool sign = a.sign ^ b.sign;
792 
793     if (a.cls == float_class_normal && b.cls == float_class_normal) {
794         uint64_t hi, lo;
795         int exp = a.exp + b.exp;
796 
797         mul64To128(a.frac, b.frac, &hi, &lo);
798         shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
799         if (lo & DECOMPOSED_OVERFLOW_BIT) {
800             shift64RightJamming(lo, 1, &lo);
801             exp += 1;
802         }
803 
804         /* Re-use a */
805         a.exp = exp;
806         a.sign = sign;
807         a.frac = lo;
808         return a;
809     }
810     /* handle all the NaN cases */
811     if (is_nan(a.cls) || is_nan(b.cls)) {
812         return pick_nan(a, b, s);
813     }
814     /* Inf * Zero == NaN */
815     if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
816         (a.cls == float_class_zero && b.cls == float_class_inf)) {
817         s->float_exception_flags |= float_flag_invalid;
818         return parts_default_nan(s);
819     }
820     /* Multiply by 0 or Inf */
821     if (a.cls == float_class_inf || a.cls == float_class_zero) {
822         a.sign = sign;
823         return a;
824     }
825     if (b.cls == float_class_inf || b.cls == float_class_zero) {
826         b.sign = sign;
827         return b;
828     }
829     g_assert_not_reached();
830 }
831 
832 float16 QEMU_FLATTEN float16_mul(float16 a, float16 b, float_status *status)
833 {
834     FloatParts pa = float16_unpack_canonical(a, status);
835     FloatParts pb = float16_unpack_canonical(b, status);
836     FloatParts pr = mul_floats(pa, pb, status);
837 
838     return float16_round_pack_canonical(pr, status);
839 }
840 
841 float32 QEMU_FLATTEN float32_mul(float32 a, float32 b, float_status *status)
842 {
843     FloatParts pa = float32_unpack_canonical(a, status);
844     FloatParts pb = float32_unpack_canonical(b, status);
845     FloatParts pr = mul_floats(pa, pb, status);
846 
847     return float32_round_pack_canonical(pr, status);
848 }
849 
850 float64 QEMU_FLATTEN float64_mul(float64 a, float64 b, float_status *status)
851 {
852     FloatParts pa = float64_unpack_canonical(a, status);
853     FloatParts pb = float64_unpack_canonical(b, status);
854     FloatParts pr = mul_floats(pa, pb, status);
855 
856     return float64_round_pack_canonical(pr, status);
857 }
858 
859 /*
860  * Returns the result of multiplying the floating-point values `a' and
861  * `b' then adding 'c', with no intermediate rounding step after the
862  * multiplication. The operation is performed according to the
863  * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
864  * The flags argument allows the caller to select negation of the
865  * addend, the intermediate product, or the final result. (The
866  * difference between this and having the caller do a separate
867  * negation is that negating externally will flip the sign bit on
868  * NaNs.)
869  */
870 
871 static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c,
872                                 int flags, float_status *s)
873 {
874     bool inf_zero = ((1 << a.cls) | (1 << b.cls)) ==
875                     ((1 << float_class_inf) | (1 << float_class_zero));
876     bool p_sign;
877     bool sign_flip = flags & float_muladd_negate_result;
878     FloatClass p_class;
879     uint64_t hi, lo;
880     int p_exp;
881 
882     /* It is implementation-defined whether the cases of (0,inf,qnan)
883      * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
884      * they return if they do), so we have to hand this information
885      * off to the target-specific pick-a-NaN routine.
886      */
887     if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) {
888         return pick_nan_muladd(a, b, c, inf_zero, s);
889     }
890 
891     if (inf_zero) {
892         s->float_exception_flags |= float_flag_invalid;
893         return parts_default_nan(s);
894     }
895 
896     if (flags & float_muladd_negate_c) {
897         c.sign ^= 1;
898     }
899 
900     p_sign = a.sign ^ b.sign;
901 
902     if (flags & float_muladd_negate_product) {
903         p_sign ^= 1;
904     }
905 
906     if (a.cls == float_class_inf || b.cls == float_class_inf) {
907         p_class = float_class_inf;
908     } else if (a.cls == float_class_zero || b.cls == float_class_zero) {
909         p_class = float_class_zero;
910     } else {
911         p_class = float_class_normal;
912     }
913 
914     if (c.cls == float_class_inf) {
915         if (p_class == float_class_inf && p_sign != c.sign) {
916             s->float_exception_flags |= float_flag_invalid;
917             return parts_default_nan(s);
918         } else {
919             a.cls = float_class_inf;
920             a.sign = c.sign ^ sign_flip;
921             return a;
922         }
923     }
924 
925     if (p_class == float_class_inf) {
926         a.cls = float_class_inf;
927         a.sign = p_sign ^ sign_flip;
928         return a;
929     }
930 
931     if (p_class == float_class_zero) {
932         if (c.cls == float_class_zero) {
933             if (p_sign != c.sign) {
934                 p_sign = s->float_rounding_mode == float_round_down;
935             }
936             c.sign = p_sign;
937         } else if (flags & float_muladd_halve_result) {
938             c.exp -= 1;
939         }
940         c.sign ^= sign_flip;
941         return c;
942     }
943 
944     /* a & b should be normals now... */
945     assert(a.cls == float_class_normal &&
946            b.cls == float_class_normal);
947 
948     p_exp = a.exp + b.exp;
949 
950     /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
951      * result.
952      */
953     mul64To128(a.frac, b.frac, &hi, &lo);
954     /* binary point now at bit 124 */
955 
956     /* check for overflow */
957     if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) {
958         shift128RightJamming(hi, lo, 1, &hi, &lo);
959         p_exp += 1;
960     }
961 
962     /* + add/sub */
963     if (c.cls == float_class_zero) {
964         /* move binary point back to 62 */
965         shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
966     } else {
967         int exp_diff = p_exp - c.exp;
968         if (p_sign == c.sign) {
969             /* Addition */
970             if (exp_diff <= 0) {
971                 shift128RightJamming(hi, lo,
972                                      DECOMPOSED_BINARY_POINT - exp_diff,
973                                      &hi, &lo);
974                 lo += c.frac;
975                 p_exp = c.exp;
976             } else {
977                 uint64_t c_hi, c_lo;
978                 /* shift c to the same binary point as the product (124) */
979                 c_hi = c.frac >> 2;
980                 c_lo = 0;
981                 shift128RightJamming(c_hi, c_lo,
982                                      exp_diff,
983                                      &c_hi, &c_lo);
984                 add128(hi, lo, c_hi, c_lo, &hi, &lo);
985                 /* move binary point back to 62 */
986                 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
987             }
988 
989             if (lo & DECOMPOSED_OVERFLOW_BIT) {
990                 shift64RightJamming(lo, 1, &lo);
991                 p_exp += 1;
992             }
993 
994         } else {
995             /* Subtraction */
996             uint64_t c_hi, c_lo;
997             /* make C binary point match product at bit 124 */
998             c_hi = c.frac >> 2;
999             c_lo = 0;
1000 
1001             if (exp_diff <= 0) {
1002                 shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
1003                 if (exp_diff == 0
1004                     &&
1005                     (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
1006                     sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1007                 } else {
1008                     sub128(c_hi, c_lo, hi, lo, &hi, &lo);
1009                     p_sign ^= 1;
1010                     p_exp = c.exp;
1011                 }
1012             } else {
1013                 shift128RightJamming(c_hi, c_lo,
1014                                      exp_diff,
1015                                      &c_hi, &c_lo);
1016                 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1017             }
1018 
1019             if (hi == 0 && lo == 0) {
1020                 a.cls = float_class_zero;
1021                 a.sign = s->float_rounding_mode == float_round_down;
1022                 a.sign ^= sign_flip;
1023                 return a;
1024             } else {
1025                 int shift;
1026                 if (hi != 0) {
1027                     shift = clz64(hi);
1028                 } else {
1029                     shift = clz64(lo) + 64;
1030                 }
1031                 /* Normalizing to a binary point of 124 is the
1032                    correct adjust for the exponent.  However since we're
1033                    shifting, we might as well put the binary point back
1034                    at 62 where we really want it.  Therefore shift as
1035                    if we're leaving 1 bit at the top of the word, but
1036                    adjust the exponent as if we're leaving 3 bits.  */
1037                 shift -= 1;
1038                 if (shift >= 64) {
1039                     lo = lo << (shift - 64);
1040                 } else {
1041                     hi = (hi << shift) | (lo >> (64 - shift));
1042                     lo = hi | ((lo << shift) != 0);
1043                 }
1044                 p_exp -= shift - 2;
1045             }
1046         }
1047     }
1048 
1049     if (flags & float_muladd_halve_result) {
1050         p_exp -= 1;
1051     }
1052 
1053     /* finally prepare our result */
1054     a.cls = float_class_normal;
1055     a.sign = p_sign ^ sign_flip;
1056     a.exp = p_exp;
1057     a.frac = lo;
1058 
1059     return a;
1060 }
1061 
1062 float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c,
1063                                                 int flags, float_status *status)
1064 {
1065     FloatParts pa = float16_unpack_canonical(a, status);
1066     FloatParts pb = float16_unpack_canonical(b, status);
1067     FloatParts pc = float16_unpack_canonical(c, status);
1068     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1069 
1070     return float16_round_pack_canonical(pr, status);
1071 }
1072 
1073 float32 QEMU_FLATTEN float32_muladd(float32 a, float32 b, float32 c,
1074                                                 int flags, float_status *status)
1075 {
1076     FloatParts pa = float32_unpack_canonical(a, status);
1077     FloatParts pb = float32_unpack_canonical(b, status);
1078     FloatParts pc = float32_unpack_canonical(c, status);
1079     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1080 
1081     return float32_round_pack_canonical(pr, status);
1082 }
1083 
1084 float64 QEMU_FLATTEN float64_muladd(float64 a, float64 b, float64 c,
1085                                                 int flags, float_status *status)
1086 {
1087     FloatParts pa = float64_unpack_canonical(a, status);
1088     FloatParts pb = float64_unpack_canonical(b, status);
1089     FloatParts pc = float64_unpack_canonical(c, status);
1090     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1091 
1092     return float64_round_pack_canonical(pr, status);
1093 }
1094 
1095 /*
1096  * Returns the result of dividing the floating-point value `a' by the
1097  * corresponding value `b'. The operation is performed according to
1098  * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1099  */
1100 
1101 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s)
1102 {
1103     bool sign = a.sign ^ b.sign;
1104 
1105     if (a.cls == float_class_normal && b.cls == float_class_normal) {
1106         uint64_t n0, n1, q, r;
1107         int exp = a.exp - b.exp;
1108 
1109         /*
1110          * We want a 2*N / N-bit division to produce exactly an N-bit
1111          * result, so that we do not lose any precision and so that we
1112          * do not have to renormalize afterward.  If A.frac < B.frac,
1113          * then division would produce an (N-1)-bit result; shift A left
1114          * by one to produce the an N-bit result, and decrement the
1115          * exponent to match.
1116          *
1117          * The udiv_qrnnd algorithm that we're using requires normalization,
1118          * i.e. the msb of the denominator must be set.  Since we know that
1119          * DECOMPOSED_BINARY_POINT is msb-1, the inputs must be shifted left
1120          * by one (more), and the remainder must be shifted right by one.
1121          */
1122         if (a.frac < b.frac) {
1123             exp -= 1;
1124             shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 2, &n1, &n0);
1125         } else {
1126             shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, &n1, &n0);
1127         }
1128         q = udiv_qrnnd(&r, n1, n0, b.frac << 1);
1129 
1130         /*
1131          * Set lsb if there is a remainder, to set inexact.
1132          * As mentioned above, to find the actual value of the remainder we
1133          * would need to shift right, but (1) we are only concerned about
1134          * non-zero-ness, and (2) the remainder will always be even because
1135          * both inputs to the division primitive are even.
1136          */
1137         a.frac = q | (r != 0);
1138         a.sign = sign;
1139         a.exp = exp;
1140         return a;
1141     }
1142     /* handle all the NaN cases */
1143     if (is_nan(a.cls) || is_nan(b.cls)) {
1144         return pick_nan(a, b, s);
1145     }
1146     /* 0/0 or Inf/Inf */
1147     if (a.cls == b.cls
1148         &&
1149         (a.cls == float_class_inf || a.cls == float_class_zero)) {
1150         s->float_exception_flags |= float_flag_invalid;
1151         return parts_default_nan(s);
1152     }
1153     /* Inf / x or 0 / x */
1154     if (a.cls == float_class_inf || a.cls == float_class_zero) {
1155         a.sign = sign;
1156         return a;
1157     }
1158     /* Div 0 => Inf */
1159     if (b.cls == float_class_zero) {
1160         s->float_exception_flags |= float_flag_divbyzero;
1161         a.cls = float_class_inf;
1162         a.sign = sign;
1163         return a;
1164     }
1165     /* Div by Inf */
1166     if (b.cls == float_class_inf) {
1167         a.cls = float_class_zero;
1168         a.sign = sign;
1169         return a;
1170     }
1171     g_assert_not_reached();
1172 }
1173 
1174 float16 float16_div(float16 a, float16 b, float_status *status)
1175 {
1176     FloatParts pa = float16_unpack_canonical(a, status);
1177     FloatParts pb = float16_unpack_canonical(b, status);
1178     FloatParts pr = div_floats(pa, pb, status);
1179 
1180     return float16_round_pack_canonical(pr, status);
1181 }
1182 
1183 float32 float32_div(float32 a, float32 b, float_status *status)
1184 {
1185     FloatParts pa = float32_unpack_canonical(a, status);
1186     FloatParts pb = float32_unpack_canonical(b, status);
1187     FloatParts pr = div_floats(pa, pb, status);
1188 
1189     return float32_round_pack_canonical(pr, status);
1190 }
1191 
1192 float64 float64_div(float64 a, float64 b, float_status *status)
1193 {
1194     FloatParts pa = float64_unpack_canonical(a, status);
1195     FloatParts pb = float64_unpack_canonical(b, status);
1196     FloatParts pr = div_floats(pa, pb, status);
1197 
1198     return float64_round_pack_canonical(pr, status);
1199 }
1200 
1201 /*
1202  * Float to Float conversions
1203  *
1204  * Returns the result of converting one float format to another. The
1205  * conversion is performed according to the IEC/IEEE Standard for
1206  * Binary Floating-Point Arithmetic.
1207  *
1208  * The float_to_float helper only needs to take care of raising
1209  * invalid exceptions and handling the conversion on NaNs.
1210  */
1211 
1212 static FloatParts float_to_float(FloatParts a, const FloatFmt *dstf,
1213                                  float_status *s)
1214 {
1215     if (dstf->arm_althp) {
1216         switch (a.cls) {
1217         case float_class_qnan:
1218         case float_class_snan:
1219             /* There is no NaN in the destination format.  Raise Invalid
1220              * and return a zero with the sign of the input NaN.
1221              */
1222             s->float_exception_flags |= float_flag_invalid;
1223             a.cls = float_class_zero;
1224             a.frac = 0;
1225             a.exp = 0;
1226             break;
1227 
1228         case float_class_inf:
1229             /* There is no Inf in the destination format.  Raise Invalid
1230              * and return the maximum normal with the correct sign.
1231              */
1232             s->float_exception_flags |= float_flag_invalid;
1233             a.cls = float_class_normal;
1234             a.exp = dstf->exp_max;
1235             a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift;
1236             break;
1237 
1238         default:
1239             break;
1240         }
1241     } else if (is_nan(a.cls)) {
1242         if (is_snan(a.cls)) {
1243             s->float_exception_flags |= float_flag_invalid;
1244             a = parts_silence_nan(a, s);
1245         }
1246         if (s->default_nan_mode) {
1247             return parts_default_nan(s);
1248         }
1249     }
1250     return a;
1251 }
1252 
1253 float32 float16_to_float32(float16 a, bool ieee, float_status *s)
1254 {
1255     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1256     FloatParts p = float16a_unpack_canonical(a, s, fmt16);
1257     FloatParts pr = float_to_float(p, &float32_params, s);
1258     return float32_round_pack_canonical(pr, s);
1259 }
1260 
1261 float64 float16_to_float64(float16 a, bool ieee, float_status *s)
1262 {
1263     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1264     FloatParts p = float16a_unpack_canonical(a, s, fmt16);
1265     FloatParts pr = float_to_float(p, &float64_params, s);
1266     return float64_round_pack_canonical(pr, s);
1267 }
1268 
1269 float16 float32_to_float16(float32 a, bool ieee, float_status *s)
1270 {
1271     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1272     FloatParts p = float32_unpack_canonical(a, s);
1273     FloatParts pr = float_to_float(p, fmt16, s);
1274     return float16a_round_pack_canonical(pr, s, fmt16);
1275 }
1276 
1277 float64 float32_to_float64(float32 a, float_status *s)
1278 {
1279     FloatParts p = float32_unpack_canonical(a, s);
1280     FloatParts pr = float_to_float(p, &float64_params, s);
1281     return float64_round_pack_canonical(pr, s);
1282 }
1283 
1284 float16 float64_to_float16(float64 a, bool ieee, float_status *s)
1285 {
1286     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1287     FloatParts p = float64_unpack_canonical(a, s);
1288     FloatParts pr = float_to_float(p, fmt16, s);
1289     return float16a_round_pack_canonical(pr, s, fmt16);
1290 }
1291 
1292 float32 float64_to_float32(float64 a, float_status *s)
1293 {
1294     FloatParts p = float64_unpack_canonical(a, s);
1295     FloatParts pr = float_to_float(p, &float32_params, s);
1296     return float32_round_pack_canonical(pr, s);
1297 }
1298 
1299 /*
1300  * Rounds the floating-point value `a' to an integer, and returns the
1301  * result as a floating-point value. The operation is performed
1302  * according to the IEC/IEEE Standard for Binary Floating-Point
1303  * Arithmetic.
1304  */
1305 
1306 static FloatParts round_to_int(FloatParts a, int rmode,
1307                                int scale, float_status *s)
1308 {
1309     switch (a.cls) {
1310     case float_class_qnan:
1311     case float_class_snan:
1312         return return_nan(a, s);
1313 
1314     case float_class_zero:
1315     case float_class_inf:
1316         /* already "integral" */
1317         break;
1318 
1319     case float_class_normal:
1320         scale = MIN(MAX(scale, -0x10000), 0x10000);
1321         a.exp += scale;
1322 
1323         if (a.exp >= DECOMPOSED_BINARY_POINT) {
1324             /* already integral */
1325             break;
1326         }
1327         if (a.exp < 0) {
1328             bool one;
1329             /* all fractional */
1330             s->float_exception_flags |= float_flag_inexact;
1331             switch (rmode) {
1332             case float_round_nearest_even:
1333                 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
1334                 break;
1335             case float_round_ties_away:
1336                 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
1337                 break;
1338             case float_round_to_zero:
1339                 one = false;
1340                 break;
1341             case float_round_up:
1342                 one = !a.sign;
1343                 break;
1344             case float_round_down:
1345                 one = a.sign;
1346                 break;
1347             default:
1348                 g_assert_not_reached();
1349             }
1350 
1351             if (one) {
1352                 a.frac = DECOMPOSED_IMPLICIT_BIT;
1353                 a.exp = 0;
1354             } else {
1355                 a.cls = float_class_zero;
1356             }
1357         } else {
1358             uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
1359             uint64_t frac_lsbm1 = frac_lsb >> 1;
1360             uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
1361             uint64_t rnd_mask = rnd_even_mask >> 1;
1362             uint64_t inc;
1363 
1364             switch (rmode) {
1365             case float_round_nearest_even:
1366                 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
1367                 break;
1368             case float_round_ties_away:
1369                 inc = frac_lsbm1;
1370                 break;
1371             case float_round_to_zero:
1372                 inc = 0;
1373                 break;
1374             case float_round_up:
1375                 inc = a.sign ? 0 : rnd_mask;
1376                 break;
1377             case float_round_down:
1378                 inc = a.sign ? rnd_mask : 0;
1379                 break;
1380             default:
1381                 g_assert_not_reached();
1382             }
1383 
1384             if (a.frac & rnd_mask) {
1385                 s->float_exception_flags |= float_flag_inexact;
1386                 a.frac += inc;
1387                 a.frac &= ~rnd_mask;
1388                 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
1389                     a.frac >>= 1;
1390                     a.exp++;
1391                 }
1392             }
1393         }
1394         break;
1395     default:
1396         g_assert_not_reached();
1397     }
1398     return a;
1399 }
1400 
1401 float16 float16_round_to_int(float16 a, float_status *s)
1402 {
1403     FloatParts pa = float16_unpack_canonical(a, s);
1404     FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
1405     return float16_round_pack_canonical(pr, s);
1406 }
1407 
1408 float32 float32_round_to_int(float32 a, float_status *s)
1409 {
1410     FloatParts pa = float32_unpack_canonical(a, s);
1411     FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
1412     return float32_round_pack_canonical(pr, s);
1413 }
1414 
1415 float64 float64_round_to_int(float64 a, float_status *s)
1416 {
1417     FloatParts pa = float64_unpack_canonical(a, s);
1418     FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
1419     return float64_round_pack_canonical(pr, s);
1420 }
1421 
1422 /*
1423  * Returns the result of converting the floating-point value `a' to
1424  * the two's complement integer format. The conversion is performed
1425  * according to the IEC/IEEE Standard for Binary Floating-Point
1426  * Arithmetic---which means in particular that the conversion is
1427  * rounded according to the current rounding mode. If `a' is a NaN,
1428  * the largest positive integer is returned. Otherwise, if the
1429  * conversion overflows, the largest integer with the same sign as `a'
1430  * is returned.
1431 */
1432 
1433 static int64_t round_to_int_and_pack(FloatParts in, int rmode, int scale,
1434                                      int64_t min, int64_t max,
1435                                      float_status *s)
1436 {
1437     uint64_t r;
1438     int orig_flags = get_float_exception_flags(s);
1439     FloatParts p = round_to_int(in, rmode, scale, s);
1440 
1441     switch (p.cls) {
1442     case float_class_snan:
1443     case float_class_qnan:
1444         s->float_exception_flags = orig_flags | float_flag_invalid;
1445         return max;
1446     case float_class_inf:
1447         s->float_exception_flags = orig_flags | float_flag_invalid;
1448         return p.sign ? min : max;
1449     case float_class_zero:
1450         return 0;
1451     case float_class_normal:
1452         if (p.exp < DECOMPOSED_BINARY_POINT) {
1453             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1454         } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1455             r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1456         } else {
1457             r = UINT64_MAX;
1458         }
1459         if (p.sign) {
1460             if (r <= -(uint64_t) min) {
1461                 return -r;
1462             } else {
1463                 s->float_exception_flags = orig_flags | float_flag_invalid;
1464                 return min;
1465             }
1466         } else {
1467             if (r <= max) {
1468                 return r;
1469             } else {
1470                 s->float_exception_flags = orig_flags | float_flag_invalid;
1471                 return max;
1472             }
1473         }
1474     default:
1475         g_assert_not_reached();
1476     }
1477 }
1478 
1479 int16_t float16_to_int16_scalbn(float16 a, int rmode, int scale,
1480                                 float_status *s)
1481 {
1482     return round_to_int_and_pack(float16_unpack_canonical(a, s),
1483                                  rmode, scale, INT16_MIN, INT16_MAX, s);
1484 }
1485 
1486 int32_t float16_to_int32_scalbn(float16 a, int rmode, int scale,
1487                                 float_status *s)
1488 {
1489     return round_to_int_and_pack(float16_unpack_canonical(a, s),
1490                                  rmode, scale, INT32_MIN, INT32_MAX, s);
1491 }
1492 
1493 int64_t float16_to_int64_scalbn(float16 a, int rmode, int scale,
1494                                 float_status *s)
1495 {
1496     return round_to_int_and_pack(float16_unpack_canonical(a, s),
1497                                  rmode, scale, INT64_MIN, INT64_MAX, s);
1498 }
1499 
1500 int16_t float32_to_int16_scalbn(float32 a, int rmode, int scale,
1501                                 float_status *s)
1502 {
1503     return round_to_int_and_pack(float32_unpack_canonical(a, s),
1504                                  rmode, scale, INT16_MIN, INT16_MAX, s);
1505 }
1506 
1507 int32_t float32_to_int32_scalbn(float32 a, int rmode, int scale,
1508                                 float_status *s)
1509 {
1510     return round_to_int_and_pack(float32_unpack_canonical(a, s),
1511                                  rmode, scale, INT32_MIN, INT32_MAX, s);
1512 }
1513 
1514 int64_t float32_to_int64_scalbn(float32 a, int rmode, int scale,
1515                                 float_status *s)
1516 {
1517     return round_to_int_and_pack(float32_unpack_canonical(a, s),
1518                                  rmode, scale, INT64_MIN, INT64_MAX, s);
1519 }
1520 
1521 int16_t float64_to_int16_scalbn(float64 a, int rmode, int scale,
1522                                 float_status *s)
1523 {
1524     return round_to_int_and_pack(float64_unpack_canonical(a, s),
1525                                  rmode, scale, INT16_MIN, INT16_MAX, s);
1526 }
1527 
1528 int32_t float64_to_int32_scalbn(float64 a, int rmode, int scale,
1529                                 float_status *s)
1530 {
1531     return round_to_int_and_pack(float64_unpack_canonical(a, s),
1532                                  rmode, scale, INT32_MIN, INT32_MAX, s);
1533 }
1534 
1535 int64_t float64_to_int64_scalbn(float64 a, int rmode, int scale,
1536                                 float_status *s)
1537 {
1538     return round_to_int_and_pack(float64_unpack_canonical(a, s),
1539                                  rmode, scale, INT64_MIN, INT64_MAX, s);
1540 }
1541 
1542 int16_t float16_to_int16(float16 a, float_status *s)
1543 {
1544     return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
1545 }
1546 
1547 int32_t float16_to_int32(float16 a, float_status *s)
1548 {
1549     return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
1550 }
1551 
1552 int64_t float16_to_int64(float16 a, float_status *s)
1553 {
1554     return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
1555 }
1556 
1557 int16_t float32_to_int16(float32 a, float_status *s)
1558 {
1559     return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
1560 }
1561 
1562 int32_t float32_to_int32(float32 a, float_status *s)
1563 {
1564     return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
1565 }
1566 
1567 int64_t float32_to_int64(float32 a, float_status *s)
1568 {
1569     return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
1570 }
1571 
1572 int16_t float64_to_int16(float64 a, float_status *s)
1573 {
1574     return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
1575 }
1576 
1577 int32_t float64_to_int32(float64 a, float_status *s)
1578 {
1579     return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
1580 }
1581 
1582 int64_t float64_to_int64(float64 a, float_status *s)
1583 {
1584     return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
1585 }
1586 
1587 int16_t float16_to_int16_round_to_zero(float16 a, float_status *s)
1588 {
1589     return float16_to_int16_scalbn(a, float_round_to_zero, 0, s);
1590 }
1591 
1592 int32_t float16_to_int32_round_to_zero(float16 a, float_status *s)
1593 {
1594     return float16_to_int32_scalbn(a, float_round_to_zero, 0, s);
1595 }
1596 
1597 int64_t float16_to_int64_round_to_zero(float16 a, float_status *s)
1598 {
1599     return float16_to_int64_scalbn(a, float_round_to_zero, 0, s);
1600 }
1601 
1602 int16_t float32_to_int16_round_to_zero(float32 a, float_status *s)
1603 {
1604     return float32_to_int16_scalbn(a, float_round_to_zero, 0, s);
1605 }
1606 
1607 int32_t float32_to_int32_round_to_zero(float32 a, float_status *s)
1608 {
1609     return float32_to_int32_scalbn(a, float_round_to_zero, 0, s);
1610 }
1611 
1612 int64_t float32_to_int64_round_to_zero(float32 a, float_status *s)
1613 {
1614     return float32_to_int64_scalbn(a, float_round_to_zero, 0, s);
1615 }
1616 
1617 int16_t float64_to_int16_round_to_zero(float64 a, float_status *s)
1618 {
1619     return float64_to_int16_scalbn(a, float_round_to_zero, 0, s);
1620 }
1621 
1622 int32_t float64_to_int32_round_to_zero(float64 a, float_status *s)
1623 {
1624     return float64_to_int32_scalbn(a, float_round_to_zero, 0, s);
1625 }
1626 
1627 int64_t float64_to_int64_round_to_zero(float64 a, float_status *s)
1628 {
1629     return float64_to_int64_scalbn(a, float_round_to_zero, 0, s);
1630 }
1631 
/*
 *  Returns the result of converting the floating-point value `a' to
 *  the unsigned integer format. The conversion is performed according
 *  to the IEC/IEEE Standard for Binary Floating-Point
 *  Arithmetic---which means in particular that the conversion is
 *  rounded according to the current rounding mode. If `a' is a NaN,
 *  the largest unsigned integer is returned. Otherwise, if the
 *  conversion overflows, the largest unsigned integer is returned. If
 *  `a' is negative, the result is rounded and zero is returned;
 *  values that do not round to zero will raise the invalid exception
 *  flag.
 */
1644 
1645 static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, int scale,
1646                                        uint64_t max, float_status *s)
1647 {
1648     int orig_flags = get_float_exception_flags(s);
1649     FloatParts p = round_to_int(in, rmode, scale, s);
1650     uint64_t r;
1651 
1652     switch (p.cls) {
1653     case float_class_snan:
1654     case float_class_qnan:
1655         s->float_exception_flags = orig_flags | float_flag_invalid;
1656         return max;
1657     case float_class_inf:
1658         s->float_exception_flags = orig_flags | float_flag_invalid;
1659         return p.sign ? 0 : max;
1660     case float_class_zero:
1661         return 0;
1662     case float_class_normal:
1663         if (p.sign) {
1664             s->float_exception_flags = orig_flags | float_flag_invalid;
1665             return 0;
1666         }
1667 
1668         if (p.exp < DECOMPOSED_BINARY_POINT) {
1669             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
1670         } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
1671             r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
1672         } else {
1673             s->float_exception_flags = orig_flags | float_flag_invalid;
1674             return max;
1675         }
1676 
1677         /* For uint64 this will never trip, but if p.exp is too large
1678          * to shift a decomposed fraction we shall have exited via the
1679          * 3rd leg above.
1680          */
1681         if (r > max) {
1682             s->float_exception_flags = orig_flags | float_flag_invalid;
1683             return max;
1684         }
1685         return r;
1686     default:
1687         g_assert_not_reached();
1688     }
1689 }
1690 
1691 uint16_t float16_to_uint16_scalbn(float16 a, int rmode, int scale,
1692                                   float_status *s)
1693 {
1694     return round_to_uint_and_pack(float16_unpack_canonical(a, s),
1695                                   rmode, scale, UINT16_MAX, s);
1696 }
1697 
1698 uint32_t float16_to_uint32_scalbn(float16 a, int rmode, int scale,
1699                                   float_status *s)
1700 {
1701     return round_to_uint_and_pack(float16_unpack_canonical(a, s),
1702                                   rmode, scale, UINT32_MAX, s);
1703 }
1704 
1705 uint64_t float16_to_uint64_scalbn(float16 a, int rmode, int scale,
1706                                   float_status *s)
1707 {
1708     return round_to_uint_and_pack(float16_unpack_canonical(a, s),
1709                                   rmode, scale, UINT64_MAX, s);
1710 }
1711 
1712 uint16_t float32_to_uint16_scalbn(float32 a, int rmode, int scale,
1713                                   float_status *s)
1714 {
1715     return round_to_uint_and_pack(float32_unpack_canonical(a, s),
1716                                   rmode, scale, UINT16_MAX, s);
1717 }
1718 
1719 uint32_t float32_to_uint32_scalbn(float32 a, int rmode, int scale,
1720                                   float_status *s)
1721 {
1722     return round_to_uint_and_pack(float32_unpack_canonical(a, s),
1723                                   rmode, scale, UINT32_MAX, s);
1724 }
1725 
1726 uint64_t float32_to_uint64_scalbn(float32 a, int rmode, int scale,
1727                                   float_status *s)
1728 {
1729     return round_to_uint_and_pack(float32_unpack_canonical(a, s),
1730                                   rmode, scale, UINT64_MAX, s);
1731 }
1732 
1733 uint16_t float64_to_uint16_scalbn(float64 a, int rmode, int scale,
1734                                   float_status *s)
1735 {
1736     return round_to_uint_and_pack(float64_unpack_canonical(a, s),
1737                                   rmode, scale, UINT16_MAX, s);
1738 }
1739 
1740 uint32_t float64_to_uint32_scalbn(float64 a, int rmode, int scale,
1741                                   float_status *s)
1742 {
1743     return round_to_uint_and_pack(float64_unpack_canonical(a, s),
1744                                   rmode, scale, UINT32_MAX, s);
1745 }
1746 
1747 uint64_t float64_to_uint64_scalbn(float64 a, int rmode, int scale,
1748                                   float_status *s)
1749 {
1750     return round_to_uint_and_pack(float64_unpack_canonical(a, s),
1751                                   rmode, scale, UINT64_MAX, s);
1752 }
1753 
1754 uint16_t float16_to_uint16(float16 a, float_status *s)
1755 {
1756     return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
1757 }
1758 
1759 uint32_t float16_to_uint32(float16 a, float_status *s)
1760 {
1761     return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
1762 }
1763 
1764 uint64_t float16_to_uint64(float16 a, float_status *s)
1765 {
1766     return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
1767 }
1768 
1769 uint16_t float32_to_uint16(float32 a, float_status *s)
1770 {
1771     return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
1772 }
1773 
1774 uint32_t float32_to_uint32(float32 a, float_status *s)
1775 {
1776     return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
1777 }
1778 
1779 uint64_t float32_to_uint64(float32 a, float_status *s)
1780 {
1781     return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
1782 }
1783 
1784 uint16_t float64_to_uint16(float64 a, float_status *s)
1785 {
1786     return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
1787 }
1788 
1789 uint32_t float64_to_uint32(float64 a, float_status *s)
1790 {
1791     return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
1792 }
1793 
1794 uint64_t float64_to_uint64(float64 a, float_status *s)
1795 {
1796     return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
1797 }
1798 
1799 uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s)
1800 {
1801     return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
1802 }
1803 
1804 uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s)
1805 {
1806     return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
1807 }
1808 
1809 uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s)
1810 {
1811     return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
1812 }
1813 
1814 uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *s)
1815 {
1816     return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s);
1817 }
1818 
1819 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s)
1820 {
1821     return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s);
1822 }
1823 
1824 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s)
1825 {
1826     return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s);
1827 }
1828 
1829 uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s)
1830 {
1831     return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s);
1832 }
1833 
1834 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s)
1835 {
1836     return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s);
1837 }
1838 
1839 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s)
1840 {
1841     return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s);
1842 }
1843 
1844 /*
1845  * Integer to float conversions
1846  *
1847  * Returns the result of converting the two's complement integer `a'
1848  * to the floating-point format. The conversion is performed according
1849  * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1850  */
1851 
1852 static FloatParts int_to_float(int64_t a, int scale, float_status *status)
1853 {
1854     FloatParts r = { .sign = false };
1855 
1856     if (a == 0) {
1857         r.cls = float_class_zero;
1858     } else {
1859         uint64_t f = a;
1860         int shift;
1861 
1862         r.cls = float_class_normal;
1863         if (a < 0) {
1864             f = -f;
1865             r.sign = true;
1866         }
1867         shift = clz64(f) - 1;
1868         scale = MIN(MAX(scale, -0x10000), 0x10000);
1869 
1870         r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
1871         r.frac = (shift < 0 ? DECOMPOSED_IMPLICIT_BIT : f << shift);
1872     }
1873 
1874     return r;
1875 }
1876 
1877 float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status)
1878 {
1879     FloatParts pa = int_to_float(a, scale, status);
1880     return float16_round_pack_canonical(pa, status);
1881 }
1882 
1883 float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status)
1884 {
1885     return int64_to_float16_scalbn(a, scale, status);
1886 }
1887 
1888 float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status)
1889 {
1890     return int64_to_float16_scalbn(a, scale, status);
1891 }
1892 
1893 float16 int64_to_float16(int64_t a, float_status *status)
1894 {
1895     return int64_to_float16_scalbn(a, 0, status);
1896 }
1897 
1898 float16 int32_to_float16(int32_t a, float_status *status)
1899 {
1900     return int64_to_float16_scalbn(a, 0, status);
1901 }
1902 
1903 float16 int16_to_float16(int16_t a, float_status *status)
1904 {
1905     return int64_to_float16_scalbn(a, 0, status);
1906 }
1907 
1908 float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status)
1909 {
1910     FloatParts pa = int_to_float(a, scale, status);
1911     return float32_round_pack_canonical(pa, status);
1912 }
1913 
1914 float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status)
1915 {
1916     return int64_to_float32_scalbn(a, scale, status);
1917 }
1918 
1919 float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status)
1920 {
1921     return int64_to_float32_scalbn(a, scale, status);
1922 }
1923 
1924 float32 int64_to_float32(int64_t a, float_status *status)
1925 {
1926     return int64_to_float32_scalbn(a, 0, status);
1927 }
1928 
1929 float32 int32_to_float32(int32_t a, float_status *status)
1930 {
1931     return int64_to_float32_scalbn(a, 0, status);
1932 }
1933 
1934 float32 int16_to_float32(int16_t a, float_status *status)
1935 {
1936     return int64_to_float32_scalbn(a, 0, status);
1937 }
1938 
1939 float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status)
1940 {
1941     FloatParts pa = int_to_float(a, scale, status);
1942     return float64_round_pack_canonical(pa, status);
1943 }
1944 
1945 float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status)
1946 {
1947     return int64_to_float64_scalbn(a, scale, status);
1948 }
1949 
1950 float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status)
1951 {
1952     return int64_to_float64_scalbn(a, scale, status);
1953 }
1954 
1955 float64 int64_to_float64(int64_t a, float_status *status)
1956 {
1957     return int64_to_float64_scalbn(a, 0, status);
1958 }
1959 
1960 float64 int32_to_float64(int32_t a, float_status *status)
1961 {
1962     return int64_to_float64_scalbn(a, 0, status);
1963 }
1964 
1965 float64 int16_to_float64(int16_t a, float_status *status)
1966 {
1967     return int64_to_float64_scalbn(a, 0, status);
1968 }
1969 
1970 
1971 /*
1972  * Unsigned Integer to float conversions
1973  *
1974  * Returns the result of converting the unsigned integer `a' to the
1975  * floating-point format. The conversion is performed according to the
1976  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1977  */
1978 
1979 static FloatParts uint_to_float(uint64_t a, int scale, float_status *status)
1980 {
1981     FloatParts r = { .sign = false };
1982 
1983     if (a == 0) {
1984         r.cls = float_class_zero;
1985     } else {
1986         scale = MIN(MAX(scale, -0x10000), 0x10000);
1987         r.cls = float_class_normal;
1988         if ((int64_t)a < 0) {
1989             r.exp = DECOMPOSED_BINARY_POINT + 1 + scale;
1990             shift64RightJamming(a, 1, &a);
1991             r.frac = a;
1992         } else {
1993             int shift = clz64(a) - 1;
1994             r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
1995             r.frac = a << shift;
1996         }
1997     }
1998 
1999     return r;
2000 }
2001 
2002 float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status)
2003 {
2004     FloatParts pa = uint_to_float(a, scale, status);
2005     return float16_round_pack_canonical(pa, status);
2006 }
2007 
2008 float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status)
2009 {
2010     return uint64_to_float16_scalbn(a, scale, status);
2011 }
2012 
2013 float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status)
2014 {
2015     return uint64_to_float16_scalbn(a, scale, status);
2016 }
2017 
2018 float16 uint64_to_float16(uint64_t a, float_status *status)
2019 {
2020     return uint64_to_float16_scalbn(a, 0, status);
2021 }
2022 
2023 float16 uint32_to_float16(uint32_t a, float_status *status)
2024 {
2025     return uint64_to_float16_scalbn(a, 0, status);
2026 }
2027 
2028 float16 uint16_to_float16(uint16_t a, float_status *status)
2029 {
2030     return uint64_to_float16_scalbn(a, 0, status);
2031 }
2032 
2033 float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status)
2034 {
2035     FloatParts pa = uint_to_float(a, scale, status);
2036     return float32_round_pack_canonical(pa, status);
2037 }
2038 
2039 float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status)
2040 {
2041     return uint64_to_float32_scalbn(a, scale, status);
2042 }
2043 
2044 float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status)
2045 {
2046     return uint64_to_float32_scalbn(a, scale, status);
2047 }
2048 
2049 float32 uint64_to_float32(uint64_t a, float_status *status)
2050 {
2051     return uint64_to_float32_scalbn(a, 0, status);
2052 }
2053 
2054 float32 uint32_to_float32(uint32_t a, float_status *status)
2055 {
2056     return uint64_to_float32_scalbn(a, 0, status);
2057 }
2058 
2059 float32 uint16_to_float32(uint16_t a, float_status *status)
2060 {
2061     return uint64_to_float32_scalbn(a, 0, status);
2062 }
2063 
2064 float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status)
2065 {
2066     FloatParts pa = uint_to_float(a, scale, status);
2067     return float64_round_pack_canonical(pa, status);
2068 }
2069 
2070 float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status)
2071 {
2072     return uint64_to_float64_scalbn(a, scale, status);
2073 }
2074 
2075 float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status)
2076 {
2077     return uint64_to_float64_scalbn(a, scale, status);
2078 }
2079 
2080 float64 uint64_to_float64(uint64_t a, float_status *status)
2081 {
2082     return uint64_to_float64_scalbn(a, 0, status);
2083 }
2084 
2085 float64 uint32_to_float64(uint32_t a, float_status *status)
2086 {
2087     return uint64_to_float64_scalbn(a, 0, status);
2088 }
2089 
2090 float64 uint16_to_float64(uint16_t a, float_status *status)
2091 {
2092     return uint64_to_float64_scalbn(a, 0, status);
2093 }
2094 
/* Float Min/Max */
/* min() and max() functions. These can't be implemented as
 * 'compare and pick one input' because that would mishandle
 * NaNs and +0 vs -0.
 *
 * minnum() and maxnum() functions. These are similar to the min()
 * and max() functions but if one of the arguments is a QNaN and
 * the other is numerical then the numerical argument is returned.
 * SNaNs will get quietened before being returned.
 * minnum() and maxnum() correspond to the IEEE 754-2008 minNum()
 * and maxNum() operations. min() and max() are the typical min/max
 * semantics provided by many CPUs which predate that specification.
 *
 * minnummag() and maxnummag() functions correspond to minNumMag()
 * and maxNumMag() from IEEE 754-2008.
 */
2111 static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin,
2112                                 bool ieee, bool ismag, float_status *s)
2113 {
2114     if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
2115         if (ieee) {
2116             /* Takes two floating-point values `a' and `b', one of
2117              * which is a NaN, and returns the appropriate NaN
2118              * result. If either `a' or `b' is a signaling NaN,
2119              * the invalid exception is raised.
2120              */
2121             if (is_snan(a.cls) || is_snan(b.cls)) {
2122                 return pick_nan(a, b, s);
2123             } else if (is_nan(a.cls) && !is_nan(b.cls)) {
2124                 return b;
2125             } else if (is_nan(b.cls) && !is_nan(a.cls)) {
2126                 return a;
2127             }
2128         }
2129         return pick_nan(a, b, s);
2130     } else {
2131         int a_exp, b_exp;
2132 
2133         switch (a.cls) {
2134         case float_class_normal:
2135             a_exp = a.exp;
2136             break;
2137         case float_class_inf:
2138             a_exp = INT_MAX;
2139             break;
2140         case float_class_zero:
2141             a_exp = INT_MIN;
2142             break;
2143         default:
2144             g_assert_not_reached();
2145             break;
2146         }
2147         switch (b.cls) {
2148         case float_class_normal:
2149             b_exp = b.exp;
2150             break;
2151         case float_class_inf:
2152             b_exp = INT_MAX;
2153             break;
2154         case float_class_zero:
2155             b_exp = INT_MIN;
2156             break;
2157         default:
2158             g_assert_not_reached();
2159             break;
2160         }
2161 
2162         if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
2163             bool a_less = a_exp < b_exp;
2164             if (a_exp == b_exp) {
2165                 a_less = a.frac < b.frac;
2166             }
2167             return a_less ^ ismin ? b : a;
2168         }
2169 
2170         if (a.sign == b.sign) {
2171             bool a_less = a_exp < b_exp;
2172             if (a_exp == b_exp) {
2173                 a_less = a.frac < b.frac;
2174             }
2175             return a.sign ^ a_less ^ ismin ? b : a;
2176         } else {
2177             return a.sign ^ ismin ? b : a;
2178         }
2179     }
2180 }
2181 
2182 #define MINMAX(sz, name, ismin, isiee, ismag)                           \
2183 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b,      \
2184                                      float_status *s)                   \
2185 {                                                                       \
2186     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
2187     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
2188     FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);      \
2189                                                                         \
2190     return float ## sz ## _round_pack_canonical(pr, s);                 \
2191 }
2192 
2193 MINMAX(16, min, true, false, false)
2194 MINMAX(16, minnum, true, true, false)
2195 MINMAX(16, minnummag, true, true, true)
2196 MINMAX(16, max, false, false, false)
2197 MINMAX(16, maxnum, false, true, false)
2198 MINMAX(16, maxnummag, false, true, true)
2199 
2200 MINMAX(32, min, true, false, false)
2201 MINMAX(32, minnum, true, true, false)
2202 MINMAX(32, minnummag, true, true, true)
2203 MINMAX(32, max, false, false, false)
2204 MINMAX(32, maxnum, false, true, false)
2205 MINMAX(32, maxnummag, false, true, true)
2206 
2207 MINMAX(64, min, true, false, false)
2208 MINMAX(64, minnum, true, true, false)
2209 MINMAX(64, minnummag, true, true, true)
2210 MINMAX(64, max, false, false, false)
2211 MINMAX(64, maxnum, false, true, false)
2212 MINMAX(64, maxnummag, false, true, true)
2213 
2214 #undef MINMAX
2215 
2216 /* Floating point compare */
2217 static int compare_floats(FloatParts a, FloatParts b, bool is_quiet,
2218                           float_status *s)
2219 {
2220     if (is_nan(a.cls) || is_nan(b.cls)) {
2221         if (!is_quiet ||
2222             a.cls == float_class_snan ||
2223             b.cls == float_class_snan) {
2224             s->float_exception_flags |= float_flag_invalid;
2225         }
2226         return float_relation_unordered;
2227     }
2228 
2229     if (a.cls == float_class_zero) {
2230         if (b.cls == float_class_zero) {
2231             return float_relation_equal;
2232         }
2233         return b.sign ? float_relation_greater : float_relation_less;
2234     } else if (b.cls == float_class_zero) {
2235         return a.sign ? float_relation_less : float_relation_greater;
2236     }
2237 
2238     /* The only really important thing about infinity is its sign. If
2239      * both are infinities the sign marks the smallest of the two.
2240      */
2241     if (a.cls == float_class_inf) {
2242         if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
2243             return float_relation_equal;
2244         }
2245         return a.sign ? float_relation_less : float_relation_greater;
2246     } else if (b.cls == float_class_inf) {
2247         return b.sign ? float_relation_greater : float_relation_less;
2248     }
2249 
2250     if (a.sign != b.sign) {
2251         return a.sign ? float_relation_less : float_relation_greater;
2252     }
2253 
2254     if (a.exp == b.exp) {
2255         if (a.frac == b.frac) {
2256             return float_relation_equal;
2257         }
2258         if (a.sign) {
2259             return a.frac > b.frac ?
2260                 float_relation_less : float_relation_greater;
2261         } else {
2262             return a.frac > b.frac ?
2263                 float_relation_greater : float_relation_less;
2264         }
2265     } else {
2266         if (a.sign) {
2267             return a.exp > b.exp ? float_relation_less : float_relation_greater;
2268         } else {
2269             return a.exp > b.exp ? float_relation_greater : float_relation_less;
2270         }
2271     }
2272 }
2273 
2274 #define COMPARE(sz)                                                     \
2275 int float ## sz ## _compare(float ## sz a, float ## sz b,               \
2276                             float_status *s)                            \
2277 {                                                                       \
2278     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
2279     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
2280     return compare_floats(pa, pb, false, s);                            \
2281 }                                                                       \
2282 int float ## sz ## _compare_quiet(float ## sz a, float ## sz b,         \
2283                                   float_status *s)                      \
2284 {                                                                       \
2285     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
2286     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
2287     return compare_floats(pa, pb, true, s);                             \
2288 }
2289 
2290 COMPARE(16)
2291 COMPARE(32)
2292 COMPARE(64)
2293 
2294 #undef COMPARE
2295 
2296 /* Multiply A by 2 raised to the power N.  */
2297 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s)
2298 {
2299     if (unlikely(is_nan(a.cls))) {
2300         return return_nan(a, s);
2301     }
2302     if (a.cls == float_class_normal) {
2303         /* The largest float type (even though not supported by FloatParts)
2304          * is float128, which has a 15 bit exponent.  Bounding N to 16 bits
2305          * still allows rounding to infinity, without allowing overflow
2306          * within the int32_t that backs FloatParts.exp.
2307          */
2308         n = MIN(MAX(n, -0x10000), 0x10000);
2309         a.exp += n;
2310     }
2311     return a;
2312 }
2313 
2314 float16 float16_scalbn(float16 a, int n, float_status *status)
2315 {
2316     FloatParts pa = float16_unpack_canonical(a, status);
2317     FloatParts pr = scalbn_decomposed(pa, n, status);
2318     return float16_round_pack_canonical(pr, status);
2319 }
2320 
2321 float32 float32_scalbn(float32 a, int n, float_status *status)
2322 {
2323     FloatParts pa = float32_unpack_canonical(a, status);
2324     FloatParts pr = scalbn_decomposed(pa, n, status);
2325     return float32_round_pack_canonical(pr, status);
2326 }
2327 
2328 float64 float64_scalbn(float64 a, int n, float_status *status)
2329 {
2330     FloatParts pa = float64_unpack_canonical(a, status);
2331     FloatParts pr = scalbn_decomposed(pa, n, status);
2332     return float64_round_pack_canonical(pr, status);
2333 }
2334 
2335 /*
2336  * Square Root
2337  *
2338  * The old softfloat code did an approximation step before zeroing in
2339  * on the final result. However for simpleness we just compute the
2340  * square root by iterating down from the implicit bit to enough extra
2341  * bits to ensure we get a correctly rounded result.
2342  *
2343  * This does mean however the calculation is slower than before,
2344  * especially for 64 bit floats.
2345  */
2346 
2347 static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p)
2348 {
2349     uint64_t a_frac, r_frac, s_frac;
2350     int bit, last_bit;
2351 
2352     if (is_nan(a.cls)) {
2353         return return_nan(a, s);
2354     }
2355     if (a.cls == float_class_zero) {
2356         return a;  /* sqrt(+-0) = +-0 */
2357     }
2358     if (a.sign) {
2359         s->float_exception_flags |= float_flag_invalid;
2360         return parts_default_nan(s);
2361     }
2362     if (a.cls == float_class_inf) {
2363         return a;  /* sqrt(+inf) = +inf */
2364     }
2365 
2366     assert(a.cls == float_class_normal);
2367 
2368     /* We need two overflow bits at the top. Adding room for that is a
2369      * right shift. If the exponent is odd, we can discard the low bit
2370      * by multiplying the fraction by 2; that's a left shift. Combine
2371      * those and we shift right if the exponent is even.
2372      */
2373     a_frac = a.frac;
2374     if (!(a.exp & 1)) {
2375         a_frac >>= 1;
2376     }
2377     a.exp >>= 1;
2378 
2379     /* Bit-by-bit computation of sqrt.  */
2380     r_frac = 0;
2381     s_frac = 0;
2382 
2383     /* Iterate from implicit bit down to the 3 extra bits to compute a
2384      * properly rounded result. Remember we've inserted one more bit
2385      * at the top, so these positions are one less.
2386      */
2387     bit = DECOMPOSED_BINARY_POINT - 1;
2388     last_bit = MAX(p->frac_shift - 4, 0);
2389     do {
2390         uint64_t q = 1ULL << bit;
2391         uint64_t t_frac = s_frac + q;
2392         if (t_frac <= a_frac) {
2393             s_frac = t_frac + q;
2394             a_frac -= t_frac;
2395             r_frac += q;
2396         }
2397         a_frac <<= 1;
2398     } while (--bit >= last_bit);
2399 
2400     /* Undo the right shift done above. If there is any remaining
2401      * fraction, the result is inexact. Set the sticky bit.
2402      */
2403     a.frac = (r_frac << 1) + (a_frac != 0);
2404 
2405     return a;
2406 }
2407 
2408 float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status)
2409 {
2410     FloatParts pa = float16_unpack_canonical(a, status);
2411     FloatParts pr = sqrt_float(pa, status, &float16_params);
2412     return float16_round_pack_canonical(pr, status);
2413 }
2414 
2415 float32 QEMU_FLATTEN float32_sqrt(float32 a, float_status *status)
2416 {
2417     FloatParts pa = float32_unpack_canonical(a, status);
2418     FloatParts pr = sqrt_float(pa, status, &float32_params);
2419     return float32_round_pack_canonical(pr, status);
2420 }
2421 
2422 float64 QEMU_FLATTEN float64_sqrt(float64 a, float_status *status)
2423 {
2424     FloatParts pa = float64_unpack_canonical(a, status);
2425     FloatParts pr = sqrt_float(pa, status, &float64_params);
2426     return float64_round_pack_canonical(pr, status);
2427 }
2428 
2429 /*----------------------------------------------------------------------------
2430 | The pattern for a default generated NaN.
2431 *----------------------------------------------------------------------------*/
2432 
2433 float16 float16_default_nan(float_status *status)
2434 {
2435     FloatParts p = parts_default_nan(status);
2436     p.frac >>= float16_params.frac_shift;
2437     return float16_pack_raw(p);
2438 }
2439 
2440 float32 float32_default_nan(float_status *status)
2441 {
2442     FloatParts p = parts_default_nan(status);
2443     p.frac >>= float32_params.frac_shift;
2444     return float32_pack_raw(p);
2445 }
2446 
2447 float64 float64_default_nan(float_status *status)
2448 {
2449     FloatParts p = parts_default_nan(status);
2450     p.frac >>= float64_params.frac_shift;
2451     return float64_pack_raw(p);
2452 }
2453 
2454 float128 float128_default_nan(float_status *status)
2455 {
2456     FloatParts p = parts_default_nan(status);
2457     float128 r;
2458 
2459     /* Extrapolate from the choices made by parts_default_nan to fill
2460      * in the quad-floating format.  If the low bit is set, assume we
2461      * want to set all non-snan bits.
2462      */
2463     r.low = -(p.frac & 1);
2464     r.high = p.frac >> (DECOMPOSED_BINARY_POINT - 48);
2465     r.high |= LIT64(0x7FFF000000000000);
2466     r.high |= (uint64_t)p.sign << 63;
2467 
2468     return r;
2469 }
2470 
2471 /*----------------------------------------------------------------------------
2472 | Returns a quiet NaN from a signalling NaN for the floating point value `a'.
2473 *----------------------------------------------------------------------------*/
2474 
2475 float16 float16_silence_nan(float16 a, float_status *status)
2476 {
2477     FloatParts p = float16_unpack_raw(a);
2478     p.frac <<= float16_params.frac_shift;
2479     p = parts_silence_nan(p, status);
2480     p.frac >>= float16_params.frac_shift;
2481     return float16_pack_raw(p);
2482 }
2483 
2484 float32 float32_silence_nan(float32 a, float_status *status)
2485 {
2486     FloatParts p = float32_unpack_raw(a);
2487     p.frac <<= float32_params.frac_shift;
2488     p = parts_silence_nan(p, status);
2489     p.frac >>= float32_params.frac_shift;
2490     return float32_pack_raw(p);
2491 }
2492 
2493 float64 float64_silence_nan(float64 a, float_status *status)
2494 {
2495     FloatParts p = float64_unpack_raw(a);
2496     p.frac <<= float64_params.frac_shift;
2497     p = parts_silence_nan(p, status);
2498     p.frac >>= float64_params.frac_shift;
2499     return float64_pack_raw(p);
2500 }
2501 
2502 /*----------------------------------------------------------------------------
2503 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
2504 | and 7, and returns the properly rounded 32-bit integer corresponding to the
2505 | input.  If `zSign' is 1, the input is negated before being converted to an
2506 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
2507 | is simply rounded to an integer, with the inexact exception raised if the
2508 | input cannot be represented exactly as an integer.  However, if the fixed-
2509 | point input is too large, the invalid exception is raised and the largest
2510 | positive or negative integer is returned.
2511 *----------------------------------------------------------------------------*/
2512 
2513 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
2514 {
2515     int8_t roundingMode;
2516     flag roundNearestEven;
2517     int8_t roundIncrement, roundBits;
2518     int32_t z;
2519 
2520     roundingMode = status->float_rounding_mode;
2521     roundNearestEven = ( roundingMode == float_round_nearest_even );
2522     switch (roundingMode) {
2523     case float_round_nearest_even:
2524     case float_round_ties_away:
2525         roundIncrement = 0x40;
2526         break;
2527     case float_round_to_zero:
2528         roundIncrement = 0;
2529         break;
2530     case float_round_up:
2531         roundIncrement = zSign ? 0 : 0x7f;
2532         break;
2533     case float_round_down:
2534         roundIncrement = zSign ? 0x7f : 0;
2535         break;
2536     default:
2537         abort();
2538     }
2539     roundBits = absZ & 0x7F;
2540     absZ = ( absZ + roundIncrement )>>7;
2541     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
2542     z = absZ;
2543     if ( zSign ) z = - z;
2544     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
2545         float_raise(float_flag_invalid, status);
2546         return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
2547     }
2548     if (roundBits) {
2549         status->float_exception_flags |= float_flag_inexact;
2550     }
2551     return z;
2552 
2553 }
2554 
2555 /*----------------------------------------------------------------------------
2556 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2557 | `absZ1', with binary point between bits 63 and 64 (between the input words),
2558 | and returns the properly rounded 64-bit integer corresponding to the input.
2559 | If `zSign' is 1, the input is negated before being converted to an integer.
2560 | Ordinarily, the fixed-point input is simply rounded to an integer, with
2561 | the inexact exception raised if the input cannot be represented exactly as
2562 | an integer.  However, if the fixed-point input is too large, the invalid
2563 | exception is raised and the largest positive or negative integer is
2564 | returned.
2565 *----------------------------------------------------------------------------*/
2566 
2567 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
2568                                float_status *status)
2569 {
2570     int8_t roundingMode;
2571     flag roundNearestEven, increment;
2572     int64_t z;
2573 
2574     roundingMode = status->float_rounding_mode;
2575     roundNearestEven = ( roundingMode == float_round_nearest_even );
2576     switch (roundingMode) {
2577     case float_round_nearest_even:
2578     case float_round_ties_away:
2579         increment = ((int64_t) absZ1 < 0);
2580         break;
2581     case float_round_to_zero:
2582         increment = 0;
2583         break;
2584     case float_round_up:
2585         increment = !zSign && absZ1;
2586         break;
2587     case float_round_down:
2588         increment = zSign && absZ1;
2589         break;
2590     default:
2591         abort();
2592     }
2593     if ( increment ) {
2594         ++absZ0;
2595         if ( absZ0 == 0 ) goto overflow;
2596         absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
2597     }
2598     z = absZ0;
2599     if ( zSign ) z = - z;
2600     if ( z && ( ( z < 0 ) ^ zSign ) ) {
2601  overflow:
2602         float_raise(float_flag_invalid, status);
2603         return
2604               zSign ? (int64_t) LIT64( 0x8000000000000000 )
2605             : LIT64( 0x7FFFFFFFFFFFFFFF );
2606     }
2607     if (absZ1) {
2608         status->float_exception_flags |= float_flag_inexact;
2609     }
2610     return z;
2611 
2612 }
2613 
2614 /*----------------------------------------------------------------------------
2615 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
2616 | `absZ1', with binary point between bits 63 and 64 (between the input words),
2617 | and returns the properly rounded 64-bit unsigned integer corresponding to the
2618 | input.  Ordinarily, the fixed-point input is simply rounded to an integer,
2619 | with the inexact exception raised if the input cannot be represented exactly
2620 | as an integer.  However, if the fixed-point input is too large, the invalid
2621 | exception is raised and the largest unsigned integer is returned.
2622 *----------------------------------------------------------------------------*/
2623 
2624 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
2625                                 uint64_t absZ1, float_status *status)
2626 {
2627     int8_t roundingMode;
2628     flag roundNearestEven, increment;
2629 
2630     roundingMode = status->float_rounding_mode;
2631     roundNearestEven = (roundingMode == float_round_nearest_even);
2632     switch (roundingMode) {
2633     case float_round_nearest_even:
2634     case float_round_ties_away:
2635         increment = ((int64_t)absZ1 < 0);
2636         break;
2637     case float_round_to_zero:
2638         increment = 0;
2639         break;
2640     case float_round_up:
2641         increment = !zSign && absZ1;
2642         break;
2643     case float_round_down:
2644         increment = zSign && absZ1;
2645         break;
2646     default:
2647         abort();
2648     }
2649     if (increment) {
2650         ++absZ0;
2651         if (absZ0 == 0) {
2652             float_raise(float_flag_invalid, status);
2653             return LIT64(0xFFFFFFFFFFFFFFFF);
2654         }
2655         absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
2656     }
2657 
2658     if (zSign && absZ0) {
2659         float_raise(float_flag_invalid, status);
2660         return 0;
2661     }
2662 
2663     if (absZ1) {
2664         status->float_exception_flags |= float_flag_inexact;
2665     }
2666     return absZ0;
2667 }
2668 
2669 /*----------------------------------------------------------------------------
2670 | If `a' is denormal and we are in flush-to-zero mode then set the
2671 | input-denormal exception and return zero. Otherwise just return the value.
2672 *----------------------------------------------------------------------------*/
2673 float32 float32_squash_input_denormal(float32 a, float_status *status)
2674 {
2675     if (status->flush_inputs_to_zero) {
2676         if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
2677             float_raise(float_flag_input_denormal, status);
2678             return make_float32(float32_val(a) & 0x80000000);
2679         }
2680     }
2681     return a;
2682 }
2683 
2684 /*----------------------------------------------------------------------------
2685 | Normalizes the subnormal single-precision floating-point value represented
2686 | by the denormalized significand `aSig'.  The normalized exponent and
2687 | significand are stored at the locations pointed to by `zExpPtr' and
2688 | `zSigPtr', respectively.
2689 *----------------------------------------------------------------------------*/
2690 
static void
 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
{
    /*
     * Left shift that brings the leading 1 of aSig up to bit 23
     * (assuming clz32 returns the number of leading zero bits).
     */
    int8_t lshift = clz32(aSig) - 8;

    *zSigPtr = aSig << lshift;
    /* The exponent drops by one for every position shifted. */
    *zExpPtr = 1 - lshift;
}
2701 
2702 /*----------------------------------------------------------------------------
2703 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2704 | and significand `zSig', and returns the proper single-precision floating-
2705 | point value corresponding to the abstract input.  Ordinarily, the abstract
2706 | value is simply rounded and packed into the single-precision format, with
2707 | the inexact exception raised if the abstract input cannot be represented
2708 | exactly.  However, if the abstract value is too large, the overflow and
2709 | inexact exceptions are raised and an infinity or maximal finite value is
2710 | returned.  If the abstract value is too small, the input value is rounded to
2711 | a subnormal number, and the underflow and inexact exceptions are raised if
2712 | the abstract input cannot be represented exactly as a subnormal single-
2713 | precision floating-point number.
2714 |     The input significand `zSig' has its binary point between bits 30
2715 | and 29, which is 7 bits to the left of the usual location.  This shifted
2716 | significand must be normalized or smaller.  If `zSig' is not normalized,
2717 | `zExp' must be 0; in that case, the result returned is a subnormal number,
2718 | and it must not require rounding.  In the usual case that `zSig' is
2719 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2720 | The handling of underflow and overflow follows the IEC/IEEE Standard for
2721 | Binary Floating-Point Arithmetic.
2722 *----------------------------------------------------------------------------*/
2723 
2724 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
2725                                    float_status *status)
2726 {
2727     int8_t roundingMode;
2728     flag roundNearestEven;
2729     int8_t roundIncrement, roundBits;
2730     flag isTiny;
2731 
2732     roundingMode = status->float_rounding_mode;
2733     roundNearestEven = ( roundingMode == float_round_nearest_even );
2734     switch (roundingMode) {
2735     case float_round_nearest_even:
2736     case float_round_ties_away:
2737         roundIncrement = 0x40;
2738         break;
2739     case float_round_to_zero:
2740         roundIncrement = 0;
2741         break;
2742     case float_round_up:
2743         roundIncrement = zSign ? 0 : 0x7f;
2744         break;
2745     case float_round_down:
2746         roundIncrement = zSign ? 0x7f : 0;
2747         break;
2748     default:
2749         abort();
2750         break;
2751     }
2752     roundBits = zSig & 0x7F;
2753     if ( 0xFD <= (uint16_t) zExp ) {
2754         if (    ( 0xFD < zExp )
2755              || (    ( zExp == 0xFD )
2756                   && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
2757            ) {
2758             float_raise(float_flag_overflow | float_flag_inexact, status);
2759             return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
2760         }
2761         if ( zExp < 0 ) {
2762             if (status->flush_to_zero) {
2763                 float_raise(float_flag_output_denormal, status);
2764                 return packFloat32(zSign, 0, 0);
2765             }
2766             isTiny =
2767                 (status->float_detect_tininess
2768                  == float_tininess_before_rounding)
2769                 || ( zExp < -1 )
2770                 || ( zSig + roundIncrement < 0x80000000 );
2771             shift32RightJamming( zSig, - zExp, &zSig );
2772             zExp = 0;
2773             roundBits = zSig & 0x7F;
2774             if (isTiny && roundBits) {
2775                 float_raise(float_flag_underflow, status);
2776             }
2777         }
2778     }
2779     if (roundBits) {
2780         status->float_exception_flags |= float_flag_inexact;
2781     }
2782     zSig = ( zSig + roundIncrement )>>7;
2783     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
2784     if ( zSig == 0 ) zExp = 0;
2785     return packFloat32( zSign, zExp, zSig );
2786 
2787 }
2788 
2789 /*----------------------------------------------------------------------------
2790 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2791 | and significand `zSig', and returns the proper single-precision floating-
2792 | point value corresponding to the abstract input.  This routine is just like
2793 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
2794 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2795 | floating-point exponent.
2796 *----------------------------------------------------------------------------*/
2797 
2798 static float32
2799  normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
2800                               float_status *status)
2801 {
2802     int8_t shiftCount;
2803 
2804     shiftCount = clz32(zSig) - 1;
2805     return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
2806                                status);
2807 
2808 }
2809 
2810 /*----------------------------------------------------------------------------
2811 | If `a' is denormal and we are in flush-to-zero mode then set the
2812 | input-denormal exception and return zero. Otherwise just return the value.
2813 *----------------------------------------------------------------------------*/
2814 float64 float64_squash_input_denormal(float64 a, float_status *status)
2815 {
2816     if (status->flush_inputs_to_zero) {
2817         if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
2818             float_raise(float_flag_input_denormal, status);
2819             return make_float64(float64_val(a) & (1ULL << 63));
2820         }
2821     }
2822     return a;
2823 }
2824 
2825 /*----------------------------------------------------------------------------
2826 | Normalizes the subnormal double-precision floating-point value represented
2827 | by the denormalized significand `aSig'.  The normalized exponent and
2828 | significand are stored at the locations pointed to by `zExpPtr' and
2829 | `zSigPtr', respectively.
2830 *----------------------------------------------------------------------------*/
2831 
static void
 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
{
    /*
     * Left shift that brings the leading 1 of aSig up to bit 52
     * (assuming clz64 returns the number of leading zero bits).
     */
    int8_t lshift = clz64(aSig) - 11;

    *zSigPtr = aSig << lshift;
    /* The exponent drops by one for every position shifted. */
    *zExpPtr = 1 - lshift;
}
2842 
2843 /*----------------------------------------------------------------------------
2844 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2845 | double-precision floating-point value, returning the result.  After being
2846 | shifted into the proper positions, the three fields are simply added
2847 | together to form the result.  This means that any integer portion of `zSig'
2848 | will be added into the exponent.  Since a properly normalized significand
2849 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2850 | than the desired result exponent whenever `zSig' is a complete, normalized
2851 | significand.
2852 *----------------------------------------------------------------------------*/
2853 
2854 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig)
2855 {
2856 
2857     return make_float64(
2858         ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
2859 
2860 }
2861 
2862 /*----------------------------------------------------------------------------
2863 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2864 | and significand `zSig', and returns the proper double-precision floating-
2865 | point value corresponding to the abstract input.  Ordinarily, the abstract
2866 | value is simply rounded and packed into the double-precision format, with
2867 | the inexact exception raised if the abstract input cannot be represented
2868 | exactly.  However, if the abstract value is too large, the overflow and
2869 | inexact exceptions are raised and an infinity or maximal finite value is
2870 | returned.  If the abstract value is too small, the input value is rounded to
2871 | a subnormal number, and the underflow and inexact exceptions are raised if
2872 | the abstract input cannot be represented exactly as a subnormal double-
2873 | precision floating-point number.
2874 |     The input significand `zSig' has its binary point between bits 62
2875 | and 61, which is 10 bits to the left of the usual location.  This shifted
2876 | significand must be normalized or smaller.  If `zSig' is not normalized,
2877 | `zExp' must be 0; in that case, the result returned is a subnormal number,
2878 | and it must not require rounding.  In the usual case that `zSig' is
2879 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
2880 | The handling of underflow and overflow follows the IEC/IEEE Standard for
2881 | Binary Floating-Point Arithmetic.
2882 *----------------------------------------------------------------------------*/
2883 
2884 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
2885                                    float_status *status)
2886 {
2887     int8_t roundingMode;
2888     flag roundNearestEven;
2889     int roundIncrement, roundBits;
2890     flag isTiny;
2891 
2892     roundingMode = status->float_rounding_mode;
2893     roundNearestEven = ( roundingMode == float_round_nearest_even );
2894     switch (roundingMode) {
2895     case float_round_nearest_even:
2896     case float_round_ties_away:
2897         roundIncrement = 0x200;
2898         break;
2899     case float_round_to_zero:
2900         roundIncrement = 0;
2901         break;
2902     case float_round_up:
2903         roundIncrement = zSign ? 0 : 0x3ff;
2904         break;
2905     case float_round_down:
2906         roundIncrement = zSign ? 0x3ff : 0;
2907         break;
2908     case float_round_to_odd:
2909         roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
2910         break;
2911     default:
2912         abort();
2913     }
2914     roundBits = zSig & 0x3FF;
2915     if ( 0x7FD <= (uint16_t) zExp ) {
2916         if (    ( 0x7FD < zExp )
2917              || (    ( zExp == 0x7FD )
2918                   && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
2919            ) {
2920             bool overflow_to_inf = roundingMode != float_round_to_odd &&
2921                                    roundIncrement != 0;
2922             float_raise(float_flag_overflow | float_flag_inexact, status);
2923             return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
2924         }
2925         if ( zExp < 0 ) {
2926             if (status->flush_to_zero) {
2927                 float_raise(float_flag_output_denormal, status);
2928                 return packFloat64(zSign, 0, 0);
2929             }
2930             isTiny =
2931                    (status->float_detect_tininess
2932                     == float_tininess_before_rounding)
2933                 || ( zExp < -1 )
2934                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
2935             shift64RightJamming( zSig, - zExp, &zSig );
2936             zExp = 0;
2937             roundBits = zSig & 0x3FF;
2938             if (isTiny && roundBits) {
2939                 float_raise(float_flag_underflow, status);
2940             }
2941             if (roundingMode == float_round_to_odd) {
2942                 /*
2943                  * For round-to-odd case, the roundIncrement depends on
2944                  * zSig which just changed.
2945                  */
2946                 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
2947             }
2948         }
2949     }
2950     if (roundBits) {
2951         status->float_exception_flags |= float_flag_inexact;
2952     }
2953     zSig = ( zSig + roundIncrement )>>10;
2954     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
2955     if ( zSig == 0 ) zExp = 0;
2956     return packFloat64( zSign, zExp, zSig );
2957 
2958 }
2959 
2960 /*----------------------------------------------------------------------------
2961 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
2962 | and significand `zSig', and returns the proper double-precision floating-
2963 | point value corresponding to the abstract input.  This routine is just like
2964 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
2965 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
2966 | floating-point exponent.
2967 *----------------------------------------------------------------------------*/
2968 
2969 static float64
2970  normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
2971                               float_status *status)
2972 {
2973     int8_t shiftCount;
2974 
2975     shiftCount = clz64(zSig) - 1;
2976     return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
2977                                status);
2978 
2979 }
2980 
2981 /*----------------------------------------------------------------------------
2982 | Normalizes the subnormal extended double-precision floating-point value
2983 | represented by the denormalized significand `aSig'.  The normalized exponent
2984 | and significand are stored at the locations pointed to by `zExpPtr' and
2985 | `zSigPtr', respectively.
2986 *----------------------------------------------------------------------------*/
2987 
void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
                                uint64_t *zSigPtr)
{
    /*
     * Left shift that brings the leading 1 of aSig up to bit 63
     * (assuming clz64 returns the number of leading zero bits).
     */
    int8_t lshift = clz64(aSig);

    *zSigPtr = aSig << lshift;
    /* The exponent drops by one for every position shifted. */
    *zExpPtr = 1 - lshift;
}
2997 
2998 /*----------------------------------------------------------------------------
2999 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3000 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
3001 | and returns the proper extended double-precision floating-point value
3002 | corresponding to the abstract input.  Ordinarily, the abstract value is
3003 | rounded and packed into the extended double-precision format, with the
3004 | inexact exception raised if the abstract input cannot be represented
3005 | exactly.  However, if the abstract value is too large, the overflow and
3006 | inexact exceptions are raised and an infinity or maximal finite value is
3007 | returned.  If the abstract value is too small, the input value is rounded to
3008 | a subnormal number, and the underflow and inexact exceptions are raised if
3009 | the abstract input cannot be represented exactly as a subnormal extended
3010 | double-precision floating-point number.
3011 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
3012 | number of bits as single or double precision, respectively.  Otherwise, the
3013 | result is rounded to the full precision of the extended double-precision
3014 | format.
3015 |     The input significand must be normalized or smaller.  If the input
3016 | significand is not normalized, `zExp' must be 0; in that case, the result
3017 | returned is a subnormal number, and it must not require rounding.  The
3018 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
3019 | Floating-Point Arithmetic.
3020 *----------------------------------------------------------------------------*/
3021 
floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
                              int32_t zExp, uint64_t zSig0, uint64_t zSig1,
                              float_status *status)
{
    int8_t roundingMode;
    flag roundNearestEven, increment, isTiny;
    int64_t roundIncrement, roundMask, roundBits;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /*
     * Precisions 64 and 32 use the reduced-precision path directly below.
     * 80, and every other value, is handled at the `precision80' label.
     */
    if ( roundingPrecision == 80 ) goto precision80;
    if ( roundingPrecision == 64 ) {
        /* The low 11 bits of zSig0 are rounded away; 0x400 is half an ulp. */
        roundIncrement = LIT64( 0x0000000000000400 );
        roundMask = LIT64( 0x00000000000007FF );
    }
    else if ( roundingPrecision == 32 ) {
        /* The low 40 bits of zSig0 are rounded away. */
        roundIncrement = LIT64( 0x0000008000000000 );
        roundMask = LIT64( 0x000000FFFFFFFFFF );
    }
    else {
        goto precision80;
    }
    /*
     * Reduced-precision path: zSig1 only matters as a sticky bit, so fold
     * "any bit set in zSig1" into bit 0 of zSig0.
     */
    zSig0 |= ( zSig1 != 0 );
    /* Nearest modes keep the half-ulp increment chosen above. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : roundMask;
        break;
    case float_round_down:
        roundIncrement = zSign ? roundMask : 0;
        break;
    default:
        abort();
    }
    roundBits = zSig0 & roundMask;
    /*
     * The unsigned comparison is true both for zExp <= 0 (zExp - 1 wraps
     * to a large unsigned value) and for zExp >= 0x7FFE.
     */
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        /*
         * Overflow: exponent above 0x7FFE, or exactly 0x7FFE with the
         * rounding increment carrying out of zSig0.
         */
        if (    ( 0x7FFE < zExp )
             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
           ) {
            /* Shared handler inside the precision80 section; roundMask
             * still holds this precision's mask when it is reached. */
            goto overflow;
        }
        if ( zExp <= 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloatx80(zSign, 0, 0);
            }
            isTiny =
                   (status->float_detect_tininess
                    == float_tininess_before_rounding)
                || ( zExp < 0 )
                || ( zSig0 <= zSig0 + roundIncrement );
            /*
             * Shift the significand right by 1 - zExp.
             * NOTE(review): shift64RightJamming is defined elsewhere;
             * presumably it ORs shifted-out bits into bit 0 - confirm.
             */
            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
            zExp = 0;
            roundBits = zSig0 & roundMask;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundBits) {
                status->float_exception_flags |= float_flag_inexact;
            }
            zSig0 += roundIncrement;
            /* Rounding set bit 63 of zSig0: use exponent field 1. */
            if ( (int64_t) zSig0 < 0 ) zExp = 1;
            /*
             * roundIncrement now becomes the lowest kept bit.  On an exact
             * tie under nearest-even, widen the mask so that bit is cleared
             * too, which leaves the result even.
             */
            roundIncrement = roundMask + 1;
            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
                roundMask |= roundIncrement;
            }
            zSig0 &= ~ roundMask;
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    if (roundBits) {
        status->float_exception_flags |= float_flag_inexact;
    }
    zSig0 += roundIncrement;
    /* Unsigned wrap-around: the add carried out of the 64-bit significand. */
    if ( zSig0 < roundIncrement ) {
        ++zExp;
        zSig0 = LIT64( 0x8000000000000000 );
    }
    /* Same tie-to-even masking as in the subnormal case above. */
    roundIncrement = roundMask + 1;
    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
        roundMask |= roundIncrement;
    }
    zSig0 &= ~ roundMask;
    if ( zSig0 == 0 ) zExp = 0;
    return packFloatx80( zSign, zExp, zSig0 );
 precision80:
    /*
     * Full-precision path: zSig0 is kept whole and zSig1 holds the bits to
     * be rounded away.  `increment' says whether zSig0 is bumped by one.
     */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        /* Top bit of zSig1 set means the discarded part is >= one half. */
        increment = ((int64_t)zSig1 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && zSig1;
        break;
    case float_round_down:
        increment = zSign && zSig1;
        break;
    default:
        abort();
    }
    /* True for zExp <= 0 and for zExp >= 0x7FFE (see the same test above). */
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        if (    ( 0x7FFE < zExp )
             || (    ( zExp == 0x7FFE )
                  && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
                  && increment
                )
           ) {
            /* With a zero mask, ~roundMask below is an all-ones significand. */
            roundMask = 0;
            /*
             * Reached by falling through from the line above, or by the
             * `goto overflow' in the reduced-precision path (which arrives
             * with that precision's roundMask).
             */
 overflow:
            float_raise(float_flag_overflow | float_flag_inexact, status);
            /*
             * Round-to-zero, round-up on a negative value and round-down
             * on a positive value return exponent 0x7FFE with significand
             * ~roundMask; every other case returns infinity.
             */
            if (    ( roundingMode == float_round_to_zero )
                 || ( zSign && ( roundingMode == float_round_up ) )
                 || ( ! zSign && ( roundingMode == float_round_down ) )
               ) {
                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
            }
            return packFloatx80(zSign,
                                floatx80_infinity_high,
                                floatx80_infinity_low);
        }
        if ( zExp <= 0 ) {
            /* Unlike the reduced-precision path, this branch does not
             * consult status->flush_to_zero. */
            isTiny =
                   (status->float_detect_tininess
                    == float_tininess_before_rounding)
                || ( zExp < 0 )
                || ! increment
                || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
            /*
             * Shift the 128-bit significand right by 1 - zExp.
             * NOTE(review): shift64ExtraRightJamming is defined elsewhere;
             * presumably shifted-out bits stay sticky in zSig1 - confirm.
             */
            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
            zExp = 0;
            if (isTiny && zSig1) {
                float_raise(float_flag_underflow, status);
            }
            if (zSig1) {
                status->float_exception_flags |= float_flag_inexact;
            }
            /* zSig1 changed in the shift, so recompute the decision. */
            switch (roundingMode) {
            case float_round_nearest_even:
            case float_round_ties_away:
                increment = ((int64_t)zSig1 < 0);
                break;
            case float_round_to_zero:
                increment = 0;
                break;
            case float_round_up:
                increment = !zSign && zSig1;
                break;
            case float_round_down:
                increment = zSign && zSig1;
                break;
            default:
                abort();
            }
            if ( increment ) {
                ++zSig0;
                /* Exact tie under nearest-even: clear bit 0 (round to even). */
                zSig0 &=
                    ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
                /* Rounding set bit 63 of zSig0: use exponent field 1. */
                if ( (int64_t) zSig0 < 0 ) zExp = 1;
            }
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    if (zSig1) {
        status->float_exception_flags |= float_flag_inexact;
    }
    if ( increment ) {
        ++zSig0;
        if ( zSig0 == 0 ) {
            /* Carry out of the significand: bump the exponent and set the
             * significand to 0x8000000000000000. */
            ++zExp;
            zSig0 = LIT64( 0x8000000000000000 );
        }
        else {
            /* Exact tie under nearest-even: clear bit 0 (round to even). */
            zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
        }
    }
    else {
        if ( zSig0 == 0 ) zExp = 0;
    }
    return packFloatx80( zSign, zExp, zSig0 );

}
3210 
3211 /*----------------------------------------------------------------------------
3212 | Takes an abstract floating-point value having sign `zSign', exponent
3213 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
3214 | and returns the proper extended double-precision floating-point value
3215 | corresponding to the abstract input.  This routine is just like
3216 | `roundAndPackFloatx80' except that the input significand does not have to be
3217 | normalized.
3218 *----------------------------------------------------------------------------*/
3219 
3220 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
3221                                        flag zSign, int32_t zExp,
3222                                        uint64_t zSig0, uint64_t zSig1,
3223                                        float_status *status)
3224 {
3225     int8_t shiftCount;
3226 
3227     if ( zSig0 == 0 ) {
3228         zSig0 = zSig1;
3229         zSig1 = 0;
3230         zExp -= 64;
3231     }
3232     shiftCount = clz64(zSig0);
3233     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3234     zExp -= shiftCount;
3235     return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
3236                                 zSig0, zSig1, status);
3237 
3238 }
3239 
3240 /*----------------------------------------------------------------------------
3241 | Returns the least-significant 64 fraction bits of the quadruple-precision
3242 | floating-point value `a'.
3243 *----------------------------------------------------------------------------*/
3244 
3245 static inline uint64_t extractFloat128Frac1( float128 a )
3246 {
3247 
3248     return a.low;
3249 
3250 }
3251 
3252 /*----------------------------------------------------------------------------
3253 | Returns the most-significant 48 fraction bits of the quadruple-precision
3254 | floating-point value `a'.
3255 *----------------------------------------------------------------------------*/
3256 
3257 static inline uint64_t extractFloat128Frac0( float128 a )
3258 {
3259 
3260     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
3261 
3262 }
3263 
3264 /*----------------------------------------------------------------------------
3265 | Returns the exponent bits of the quadruple-precision floating-point value
3266 | `a'.
3267 *----------------------------------------------------------------------------*/
3268 
3269 static inline int32_t extractFloat128Exp( float128 a )
3270 {
3271 
3272     return ( a.high>>48 ) & 0x7FFF;
3273 
3274 }
3275 
3276 /*----------------------------------------------------------------------------
3277 | Returns the sign bit of the quadruple-precision floating-point value `a'.
3278 *----------------------------------------------------------------------------*/
3279 
3280 static inline flag extractFloat128Sign( float128 a )
3281 {
3282 
3283     return a.high>>63;
3284 
3285 }
3286 
3287 /*----------------------------------------------------------------------------
3288 | Normalizes the subnormal quadruple-precision floating-point value
3289 | represented by the denormalized significand formed by the concatenation of
3290 | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
3291 | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
3292 | significand are stored at the location pointed to by `zSig0Ptr', and the
3293 | least significant 64 bits of the normalized significand are stored at the
3294 | location pointed to by `zSig1Ptr'.
3295 *----------------------------------------------------------------------------*/
3296 
static void
 normalizeFloat128Subnormal(
     uint64_t aSig0,
     uint64_t aSig1,
     int32_t *zExpPtr,
     uint64_t *zSig0Ptr,
     uint64_t *zSig1Ptr
 )
{
    int8_t lshift;

    if (aSig0 != 0) {
        /* Leading 1 is in the high word: one 128-bit left shift suffices. */
        lshift = clz64(aSig0) - 15;
        shortShift128Left(aSig0, aSig1, lshift, zSig0Ptr, zSig1Ptr);
        *zExpPtr = 1 - lshift;
        return;
    }

    /*
     * High word is zero, so the low word supplies the whole significand.
     * lshift is its shift relative to the high-word position and may be
     * negative.
     */
    lshift = clz64(aSig1) - 15;
    if (lshift >= 0) {
        *zSig0Ptr = aSig1 << lshift;
        *zSig1Ptr = 0;
    } else {
        /* Split aSig1 across both output words. */
        *zSig0Ptr = aSig1 >> (-lshift);
        *zSig1Ptr = aSig1 << (lshift & 63);
    }
    /* The extra 64 accounts for moving from the low word to the high word. */
    *zExpPtr = -lshift - 63;
}
3327 
3328 /*----------------------------------------------------------------------------
3329 | Packs the sign `zSign', the exponent `zExp', and the significand formed
3330 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
3331 | floating-point value, returning the result.  After being shifted into the
3332 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
3333 | added together to form the most significant 32 bits of the result.  This
3334 | means that any integer portion of `zSig0' will be added into the exponent.
3335 | Since a properly normalized significand will have an integer portion equal
3336 | to 1, the `zExp' input should be 1 less than the desired result exponent
3337 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
3338 | significand.
3339 *----------------------------------------------------------------------------*/
3340 
3341 static inline float128
3342  packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
3343 {
3344     float128 z;
3345 
3346     z.low = zSig1;
3347     z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
3348     return z;
3349 
3350 }
3351 
3352 /*----------------------------------------------------------------------------
3353 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3354 | and extended significand formed by the concatenation of `zSig0', `zSig1',
3355 | and `zSig2', and returns the proper quadruple-precision floating-point value
3356 | corresponding to the abstract input.  Ordinarily, the abstract value is
3357 | simply rounded and packed into the quadruple-precision format, with the
3358 | inexact exception raised if the abstract input cannot be represented
3359 | exactly.  However, if the abstract value is too large, the overflow and
3360 | inexact exceptions are raised and an infinity or maximal finite value is
3361 | returned.  If the abstract value is too small, the input value is rounded to
3362 | a subnormal number, and the underflow and inexact exceptions are raised if
3363 | the abstract input cannot be represented exactly as a subnormal quadruple-
3364 | precision floating-point number.
3365 |     The input significand must be normalized or smaller.  If the input
3366 | significand is not normalized, `zExp' must be 0; in that case, the result
3367 | returned is a subnormal number, and it must not require rounding.  In the
3368 | usual case that the input significand is normalized, `zExp' must be 1 less
3369 | than the ``true'' floating-point exponent.  The handling of underflow and
3370 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3371 *----------------------------------------------------------------------------*/
3372 
3373 static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
3374                                      uint64_t zSig0, uint64_t zSig1,
3375                                      uint64_t zSig2, float_status *status)
3376 {
3377     int8_t roundingMode;
3378     flag roundNearestEven, increment, isTiny;
3379 
3380     roundingMode = status->float_rounding_mode;
3381     roundNearestEven = ( roundingMode == float_round_nearest_even );
3382     switch (roundingMode) {
3383     case float_round_nearest_even:
3384     case float_round_ties_away:
3385         increment = ((int64_t)zSig2 < 0);
3386         break;
3387     case float_round_to_zero:
3388         increment = 0;
3389         break;
3390     case float_round_up:
3391         increment = !zSign && zSig2;
3392         break;
3393     case float_round_down:
3394         increment = zSign && zSig2;
3395         break;
3396     case float_round_to_odd:
3397         increment = !(zSig1 & 0x1) && zSig2;
3398         break;
3399     default:
3400         abort();
3401     }
3402     if ( 0x7FFD <= (uint32_t) zExp ) {
3403         if (    ( 0x7FFD < zExp )
3404              || (    ( zExp == 0x7FFD )
3405                   && eq128(
3406                          LIT64( 0x0001FFFFFFFFFFFF ),
3407                          LIT64( 0xFFFFFFFFFFFFFFFF ),
3408                          zSig0,
3409                          zSig1
3410                      )
3411                   && increment
3412                 )
3413            ) {
3414             float_raise(float_flag_overflow | float_flag_inexact, status);
3415             if (    ( roundingMode == float_round_to_zero )
3416                  || ( zSign && ( roundingMode == float_round_up ) )
3417                  || ( ! zSign && ( roundingMode == float_round_down ) )
3418                  || (roundingMode == float_round_to_odd)
3419                ) {
3420                 return
3421                     packFloat128(
3422                         zSign,
3423                         0x7FFE,
3424                         LIT64( 0x0000FFFFFFFFFFFF ),
3425                         LIT64( 0xFFFFFFFFFFFFFFFF )
3426                     );
3427             }
3428             return packFloat128( zSign, 0x7FFF, 0, 0 );
3429         }
3430         if ( zExp < 0 ) {
3431             if (status->flush_to_zero) {
3432                 float_raise(float_flag_output_denormal, status);
3433                 return packFloat128(zSign, 0, 0, 0);
3434             }
3435             isTiny =
3436                    (status->float_detect_tininess
3437                     == float_tininess_before_rounding)
3438                 || ( zExp < -1 )
3439                 || ! increment
3440                 || lt128(
3441                        zSig0,
3442                        zSig1,
3443                        LIT64( 0x0001FFFFFFFFFFFF ),
3444                        LIT64( 0xFFFFFFFFFFFFFFFF )
3445                    );
3446             shift128ExtraRightJamming(
3447                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
3448             zExp = 0;
3449             if (isTiny && zSig2) {
3450                 float_raise(float_flag_underflow, status);
3451             }
3452             switch (roundingMode) {
3453             case float_round_nearest_even:
3454             case float_round_ties_away:
3455                 increment = ((int64_t)zSig2 < 0);
3456                 break;
3457             case float_round_to_zero:
3458                 increment = 0;
3459                 break;
3460             case float_round_up:
3461                 increment = !zSign && zSig2;
3462                 break;
3463             case float_round_down:
3464                 increment = zSign && zSig2;
3465                 break;
3466             case float_round_to_odd:
3467                 increment = !(zSig1 & 0x1) && zSig2;
3468                 break;
3469             default:
3470                 abort();
3471             }
3472         }
3473     }
3474     if (zSig2) {
3475         status->float_exception_flags |= float_flag_inexact;
3476     }
3477     if ( increment ) {
3478         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
3479         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
3480     }
3481     else {
3482         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
3483     }
3484     return packFloat128( zSign, zExp, zSig0, zSig1 );
3485 
3486 }
3487 
3488 /*----------------------------------------------------------------------------
3489 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3490 | and significand formed by the concatenation of `zSig0' and `zSig1', and
3491 | returns the proper quadruple-precision floating-point value corresponding
3492 | to the abstract input.  This routine is just like `roundAndPackFloat128'
3493 | except that the input significand has fewer bits and does not have to be
3494 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
3495 | point exponent.
3496 *----------------------------------------------------------------------------*/
3497 
3498 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
3499                                               uint64_t zSig0, uint64_t zSig1,
3500                                               float_status *status)
3501 {
3502     int8_t shiftCount;
3503     uint64_t zSig2;
3504 
3505     if ( zSig0 == 0 ) {
3506         zSig0 = zSig1;
3507         zSig1 = 0;
3508         zExp -= 64;
3509     }
3510     shiftCount = clz64(zSig0) - 15;
3511     if ( 0 <= shiftCount ) {
3512         zSig2 = 0;
3513         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3514     }
3515     else {
3516         shift128ExtraRightJamming(
3517             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
3518     }
3519     zExp -= shiftCount;
3520     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
3521 
3522 }
3523 
3524 
3525 /*----------------------------------------------------------------------------
3526 | Returns the result of converting the 32-bit two's complement integer `a'
3527 | to the extended double-precision floating-point format.  The conversion
3528 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3529 | Arithmetic.
3530 *----------------------------------------------------------------------------*/
3531 
3532 floatx80 int32_to_floatx80(int32_t a, float_status *status)
3533 {
3534     flag zSign;
3535     uint32_t absA;
3536     int8_t shiftCount;
3537     uint64_t zSig;
3538 
3539     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
3540     zSign = ( a < 0 );
3541     absA = zSign ? - a : a;
3542     shiftCount = clz32(absA) + 32;
3543     zSig = absA;
3544     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
3545 
3546 }
3547 
3548 /*----------------------------------------------------------------------------
3549 | Returns the result of converting the 32-bit two's complement integer `a' to
3550 | the quadruple-precision floating-point format.  The conversion is performed
3551 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3552 *----------------------------------------------------------------------------*/
3553 
3554 float128 int32_to_float128(int32_t a, float_status *status)
3555 {
3556     flag zSign;
3557     uint32_t absA;
3558     int8_t shiftCount;
3559     uint64_t zSig0;
3560 
3561     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
3562     zSign = ( a < 0 );
3563     absA = zSign ? - a : a;
3564     shiftCount = clz32(absA) + 17;
3565     zSig0 = absA;
3566     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
3567 
3568 }
3569 
3570 /*----------------------------------------------------------------------------
3571 | Returns the result of converting the 64-bit two's complement integer `a'
3572 | to the extended double-precision floating-point format.  The conversion
3573 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3574 | Arithmetic.
3575 *----------------------------------------------------------------------------*/
3576 
3577 floatx80 int64_to_floatx80(int64_t a, float_status *status)
3578 {
3579     flag zSign;
3580     uint64_t absA;
3581     int8_t shiftCount;
3582 
3583     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
3584     zSign = ( a < 0 );
3585     absA = zSign ? - a : a;
3586     shiftCount = clz64(absA);
3587     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
3588 
3589 }
3590 
3591 /*----------------------------------------------------------------------------
3592 | Returns the result of converting the 64-bit two's complement integer `a' to
3593 | the quadruple-precision floating-point format.  The conversion is performed
3594 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3595 *----------------------------------------------------------------------------*/
3596 
3597 float128 int64_to_float128(int64_t a, float_status *status)
3598 {
3599     flag zSign;
3600     uint64_t absA;
3601     int8_t shiftCount;
3602     int32_t zExp;
3603     uint64_t zSig0, zSig1;
3604 
3605     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
3606     zSign = ( a < 0 );
3607     absA = zSign ? - a : a;
3608     shiftCount = clz64(absA) + 49;
3609     zExp = 0x406E - shiftCount;
3610     if ( 64 <= shiftCount ) {
3611         zSig1 = 0;
3612         zSig0 = absA;
3613         shiftCount -= 64;
3614     }
3615     else {
3616         zSig1 = absA;
3617         zSig0 = 0;
3618     }
3619     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
3620     return packFloat128( zSign, zExp, zSig0, zSig1 );
3621 
3622 }
3623 
3624 /*----------------------------------------------------------------------------
3625 | Returns the result of converting the 64-bit unsigned integer `a'
3626 | to the quadruple-precision floating-point format.  The conversion is performed
3627 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3628 *----------------------------------------------------------------------------*/
3629 
3630 float128 uint64_to_float128(uint64_t a, float_status *status)
3631 {
3632     if (a == 0) {
3633         return float128_zero;
3634     }
3635     return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status);
3636 }
3637 
3638 /*----------------------------------------------------------------------------
3639 | Returns the result of converting the single-precision floating-point value
3640 | `a' to the extended double-precision floating-point format.  The conversion
3641 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3642 | Arithmetic.
3643 *----------------------------------------------------------------------------*/
3644 
3645 floatx80 float32_to_floatx80(float32 a, float_status *status)
3646 {
3647     flag aSign;
3648     int aExp;
3649     uint32_t aSig;
3650 
3651     a = float32_squash_input_denormal(a, status);
3652     aSig = extractFloat32Frac( a );
3653     aExp = extractFloat32Exp( a );
3654     aSign = extractFloat32Sign( a );
3655     if ( aExp == 0xFF ) {
3656         if (aSig) {
3657             return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
3658         }
3659         return packFloatx80(aSign,
3660                             floatx80_infinity_high,
3661                             floatx80_infinity_low);
3662     }
3663     if ( aExp == 0 ) {
3664         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3665         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3666     }
3667     aSig |= 0x00800000;
3668     return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
3669 
3670 }
3671 
3672 /*----------------------------------------------------------------------------
3673 | Returns the result of converting the single-precision floating-point value
3674 | `a' to the double-precision floating-point format.  The conversion is
3675 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3676 | Arithmetic.
3677 *----------------------------------------------------------------------------*/
3678 
3679 float128 float32_to_float128(float32 a, float_status *status)
3680 {
3681     flag aSign;
3682     int aExp;
3683     uint32_t aSig;
3684 
3685     a = float32_squash_input_denormal(a, status);
3686     aSig = extractFloat32Frac( a );
3687     aExp = extractFloat32Exp( a );
3688     aSign = extractFloat32Sign( a );
3689     if ( aExp == 0xFF ) {
3690         if (aSig) {
3691             return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
3692         }
3693         return packFloat128( aSign, 0x7FFF, 0, 0 );
3694     }
3695     if ( aExp == 0 ) {
3696         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3697         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3698         --aExp;
3699     }
3700     return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
3701 
3702 }
3703 
3704 /*----------------------------------------------------------------------------
3705 | Returns the remainder of the single-precision floating-point value `a'
3706 | with respect to the corresponding value `b'.  The operation is performed
3707 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3708 *----------------------------------------------------------------------------*/
3709 
3710 float32 float32_rem(float32 a, float32 b, float_status *status)
3711 {
3712     flag aSign, zSign;
3713     int aExp, bExp, expDiff;
3714     uint32_t aSig, bSig;
3715     uint32_t q;
3716     uint64_t aSig64, bSig64, q64;
3717     uint32_t alternateASig;
3718     int32_t sigMean;
3719     a = float32_squash_input_denormal(a, status);
3720     b = float32_squash_input_denormal(b, status);
3721 
3722     aSig = extractFloat32Frac( a );
3723     aExp = extractFloat32Exp( a );
3724     aSign = extractFloat32Sign( a );
3725     bSig = extractFloat32Frac( b );
3726     bExp = extractFloat32Exp( b );
3727     if ( aExp == 0xFF ) {
3728         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
3729             return propagateFloat32NaN(a, b, status);
3730         }
3731         float_raise(float_flag_invalid, status);
3732         return float32_default_nan(status);
3733     }
3734     if ( bExp == 0xFF ) {
3735         if (bSig) {
3736             return propagateFloat32NaN(a, b, status);
3737         }
3738         return a;
3739     }
3740     if ( bExp == 0 ) {
3741         if ( bSig == 0 ) {
3742             float_raise(float_flag_invalid, status);
3743             return float32_default_nan(status);
3744         }
3745         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
3746     }
3747     if ( aExp == 0 ) {
3748         if ( aSig == 0 ) return a;
3749         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3750     }
3751     expDiff = aExp - bExp;
3752     aSig |= 0x00800000;
3753     bSig |= 0x00800000;
3754     if ( expDiff < 32 ) {
3755         aSig <<= 8;
3756         bSig <<= 8;
3757         if ( expDiff < 0 ) {
3758             if ( expDiff < -1 ) return a;
3759             aSig >>= 1;
3760         }
3761         q = ( bSig <= aSig );
3762         if ( q ) aSig -= bSig;
3763         if ( 0 < expDiff ) {
3764             q = ( ( (uint64_t) aSig )<<32 ) / bSig;
3765             q >>= 32 - expDiff;
3766             bSig >>= 2;
3767             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3768         }
3769         else {
3770             aSig >>= 2;
3771             bSig >>= 2;
3772         }
3773     }
3774     else {
3775         if ( bSig <= aSig ) aSig -= bSig;
3776         aSig64 = ( (uint64_t) aSig )<<40;
3777         bSig64 = ( (uint64_t) bSig )<<40;
3778         expDiff -= 64;
3779         while ( 0 < expDiff ) {
3780             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3781             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3782             aSig64 = - ( ( bSig * q64 )<<38 );
3783             expDiff -= 62;
3784         }
3785         expDiff += 64;
3786         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
3787         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
3788         q = q64>>( 64 - expDiff );
3789         bSig <<= 6;
3790         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
3791     }
3792     do {
3793         alternateASig = aSig;
3794         ++q;
3795         aSig -= bSig;
3796     } while ( 0 <= (int32_t) aSig );
3797     sigMean = aSig + alternateASig;
3798     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3799         aSig = alternateASig;
3800     }
3801     zSign = ( (int32_t) aSig < 0 );
3802     if ( zSign ) aSig = - aSig;
3803     return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
3804 }
3805 
3806 
3807 
3808 /*----------------------------------------------------------------------------
3809 | Returns the binary exponential of the single-precision floating-point value
3810 | `a'. The operation is performed according to the IEC/IEEE Standard for
3811 | Binary Floating-Point Arithmetic.
3812 |
3813 | Uses the following identities:
3814 |
3815 | 1. -------------------------------------------------------------------------
3816 |      x    x*ln(2)
3817 |     2  = e
3818 |
3819 | 2. -------------------------------------------------------------------------
3820 |                      2     3     4     5           n
3821 |      x        x     x     x     x     x           x
3822 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
3823 |               1!    2!    3!    4!    5!          n!
3824 *----------------------------------------------------------------------------*/
3825 
3826 static const float64 float32_exp2_coefficients[15] =
3827 {
3828     const_float64( 0x3ff0000000000000ll ), /*  1 */
3829     const_float64( 0x3fe0000000000000ll ), /*  2 */
3830     const_float64( 0x3fc5555555555555ll ), /*  3 */
3831     const_float64( 0x3fa5555555555555ll ), /*  4 */
3832     const_float64( 0x3f81111111111111ll ), /*  5 */
3833     const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
3834     const_float64( 0x3f2a01a01a01a01all ), /*  7 */
3835     const_float64( 0x3efa01a01a01a01all ), /*  8 */
3836     const_float64( 0x3ec71de3a556c734ll ), /*  9 */
3837     const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
3838     const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
3839     const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
3840     const_float64( 0x3de6124613a86d09ll ), /* 13 */
3841     const_float64( 0x3da93974a8c07c9dll ), /* 14 */
3842     const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
3843 };
3844 
3845 float32 float32_exp2(float32 a, float_status *status)
3846 {
3847     flag aSign;
3848     int aExp;
3849     uint32_t aSig;
3850     float64 r, x, xn;
3851     int i;
3852     a = float32_squash_input_denormal(a, status);
3853 
3854     aSig = extractFloat32Frac( a );
3855     aExp = extractFloat32Exp( a );
3856     aSign = extractFloat32Sign( a );
3857 
3858     if ( aExp == 0xFF) {
3859         if (aSig) {
3860             return propagateFloat32NaN(a, float32_zero, status);
3861         }
3862         return (aSign) ? float32_zero : a;
3863     }
3864     if (aExp == 0) {
3865         if (aSig == 0) return float32_one;
3866     }
3867 
3868     float_raise(float_flag_inexact, status);
3869 
3870     /* ******************************* */
3871     /* using float64 for approximation */
3872     /* ******************************* */
3873     x = float32_to_float64(a, status);
3874     x = float64_mul(x, float64_ln2, status);
3875 
3876     xn = x;
3877     r = float64_one;
3878     for (i = 0 ; i < 15 ; i++) {
3879         float64 f;
3880 
3881         f = float64_mul(xn, float32_exp2_coefficients[i], status);
3882         r = float64_add(r, f, status);
3883 
3884         xn = float64_mul(xn, x, status);
3885     }
3886 
3887     return float64_to_float32(r, status);
3888 }
3889 
3890 /*----------------------------------------------------------------------------
3891 | Returns the binary log of the single-precision floating-point value `a'.
3892 | The operation is performed according to the IEC/IEEE Standard for Binary
3893 | Floating-Point Arithmetic.
3894 *----------------------------------------------------------------------------*/
3895 float32 float32_log2(float32 a, float_status *status)
3896 {
3897     flag aSign, zSign;
3898     int aExp;
3899     uint32_t aSig, zSig, i;
3900 
3901     a = float32_squash_input_denormal(a, status);
3902     aSig = extractFloat32Frac( a );
3903     aExp = extractFloat32Exp( a );
3904     aSign = extractFloat32Sign( a );
3905 
3906     if ( aExp == 0 ) {
3907         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
3908         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
3909     }
3910     if ( aSign ) {
3911         float_raise(float_flag_invalid, status);
3912         return float32_default_nan(status);
3913     }
3914     if ( aExp == 0xFF ) {
3915         if (aSig) {
3916             return propagateFloat32NaN(a, float32_zero, status);
3917         }
3918         return a;
3919     }
3920 
3921     aExp -= 0x7F;
3922     aSig |= 0x00800000;
3923     zSign = aExp < 0;
3924     zSig = aExp << 23;
3925 
3926     for (i = 1 << 22; i > 0; i >>= 1) {
3927         aSig = ( (uint64_t)aSig * aSig ) >> 23;
3928         if ( aSig & 0x01000000 ) {
3929             aSig >>= 1;
3930             zSig |= i;
3931         }
3932     }
3933 
3934     if ( zSign )
3935         zSig = -zSig;
3936 
3937     return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
3938 }
3939 
3940 /*----------------------------------------------------------------------------
3941 | Returns 1 if the single-precision floating-point value `a' is equal to
3942 | the corresponding value `b', and 0 otherwise.  The invalid exception is
3943 | raised if either operand is a NaN.  Otherwise, the comparison is performed
3944 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3945 *----------------------------------------------------------------------------*/
3946 
3947 int float32_eq(float32 a, float32 b, float_status *status)
3948 {
3949     uint32_t av, bv;
3950     a = float32_squash_input_denormal(a, status);
3951     b = float32_squash_input_denormal(b, status);
3952 
3953     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3954          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3955        ) {
3956         float_raise(float_flag_invalid, status);
3957         return 0;
3958     }
3959     av = float32_val(a);
3960     bv = float32_val(b);
3961     return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3962 }
3963 
3964 /*----------------------------------------------------------------------------
3965 | Returns 1 if the single-precision floating-point value `a' is less than
3966 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
3967 | exception is raised if either operand is a NaN.  The comparison is performed
3968 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3969 *----------------------------------------------------------------------------*/
3970 
3971 int float32_le(float32 a, float32 b, float_status *status)
3972 {
3973     flag aSign, bSign;
3974     uint32_t av, bv;
3975     a = float32_squash_input_denormal(a, status);
3976     b = float32_squash_input_denormal(b, status);
3977 
3978     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3979          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3980        ) {
3981         float_raise(float_flag_invalid, status);
3982         return 0;
3983     }
3984     aSign = extractFloat32Sign( a );
3985     bSign = extractFloat32Sign( b );
3986     av = float32_val(a);
3987     bv = float32_val(b);
3988     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3989     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3990 
3991 }
3992 
3993 /*----------------------------------------------------------------------------
3994 | Returns 1 if the single-precision floating-point value `a' is less than
3995 | the corresponding value `b', and 0 otherwise.  The invalid exception is
3996 | raised if either operand is a NaN.  The comparison is performed according
3997 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3998 *----------------------------------------------------------------------------*/
3999 
4000 int float32_lt(float32 a, float32 b, float_status *status)
4001 {
4002     flag aSign, bSign;
4003     uint32_t av, bv;
4004     a = float32_squash_input_denormal(a, status);
4005     b = float32_squash_input_denormal(b, status);
4006 
4007     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4008          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4009        ) {
4010         float_raise(float_flag_invalid, status);
4011         return 0;
4012     }
4013     aSign = extractFloat32Sign( a );
4014     bSign = extractFloat32Sign( b );
4015     av = float32_val(a);
4016     bv = float32_val(b);
4017     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
4018     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4019 
4020 }
4021 
4022 /*----------------------------------------------------------------------------
4023 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
4024 | be compared, and 0 otherwise.  The invalid exception is raised if either
4025 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
4026 | Standard for Binary Floating-Point Arithmetic.
4027 *----------------------------------------------------------------------------*/
4028 
4029 int float32_unordered(float32 a, float32 b, float_status *status)
4030 {
4031     a = float32_squash_input_denormal(a, status);
4032     b = float32_squash_input_denormal(b, status);
4033 
4034     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4035          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4036        ) {
4037         float_raise(float_flag_invalid, status);
4038         return 1;
4039     }
4040     return 0;
4041 }
4042 
4043 /*----------------------------------------------------------------------------
4044 | Returns 1 if the single-precision floating-point value `a' is equal to
4045 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4046 | exception.  The comparison is performed according to the IEC/IEEE Standard
4047 | for Binary Floating-Point Arithmetic.
4048 *----------------------------------------------------------------------------*/
4049 
4050 int float32_eq_quiet(float32 a, float32 b, float_status *status)
4051 {
4052     a = float32_squash_input_denormal(a, status);
4053     b = float32_squash_input_denormal(b, status);
4054 
4055     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4056          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4057        ) {
4058         if (float32_is_signaling_nan(a, status)
4059          || float32_is_signaling_nan(b, status)) {
4060             float_raise(float_flag_invalid, status);
4061         }
4062         return 0;
4063     }
4064     return ( float32_val(a) == float32_val(b) ) ||
4065             ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
4066 }
4067 
4068 /*----------------------------------------------------------------------------
4069 | Returns 1 if the single-precision floating-point value `a' is less than or
4070 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4071 | cause an exception.  Otherwise, the comparison is performed according to the
4072 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4073 *----------------------------------------------------------------------------*/
4074 
4075 int float32_le_quiet(float32 a, float32 b, float_status *status)
4076 {
4077     flag aSign, bSign;
4078     uint32_t av, bv;
4079     a = float32_squash_input_denormal(a, status);
4080     b = float32_squash_input_denormal(b, status);
4081 
4082     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4083          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4084        ) {
4085         if (float32_is_signaling_nan(a, status)
4086          || float32_is_signaling_nan(b, status)) {
4087             float_raise(float_flag_invalid, status);
4088         }
4089         return 0;
4090     }
4091     aSign = extractFloat32Sign( a );
4092     bSign = extractFloat32Sign( b );
4093     av = float32_val(a);
4094     bv = float32_val(b);
4095     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
4096     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4097 
4098 }
4099 
4100 /*----------------------------------------------------------------------------
4101 | Returns 1 if the single-precision floating-point value `a' is less than
4102 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4103 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4104 | Standard for Binary Floating-Point Arithmetic.
4105 *----------------------------------------------------------------------------*/
4106 
4107 int float32_lt_quiet(float32 a, float32 b, float_status *status)
4108 {
4109     flag aSign, bSign;
4110     uint32_t av, bv;
4111     a = float32_squash_input_denormal(a, status);
4112     b = float32_squash_input_denormal(b, status);
4113 
4114     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4115          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4116        ) {
4117         if (float32_is_signaling_nan(a, status)
4118          || float32_is_signaling_nan(b, status)) {
4119             float_raise(float_flag_invalid, status);
4120         }
4121         return 0;
4122     }
4123     aSign = extractFloat32Sign( a );
4124     bSign = extractFloat32Sign( b );
4125     av = float32_val(a);
4126     bv = float32_val(b);
4127     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
4128     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4129 
4130 }
4131 
4132 /*----------------------------------------------------------------------------
4133 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
4134 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
4135 | comparison is performed according to the IEC/IEEE Standard for Binary
4136 | Floating-Point Arithmetic.
4137 *----------------------------------------------------------------------------*/
4138 
4139 int float32_unordered_quiet(float32 a, float32 b, float_status *status)
4140 {
4141     a = float32_squash_input_denormal(a, status);
4142     b = float32_squash_input_denormal(b, status);
4143 
4144     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4145          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4146        ) {
4147         if (float32_is_signaling_nan(a, status)
4148          || float32_is_signaling_nan(b, status)) {
4149             float_raise(float_flag_invalid, status);
4150         }
4151         return 1;
4152     }
4153     return 0;
4154 }
4155 
4156 /*----------------------------------------------------------------------------
4157 | If `a' is denormal and we are in flush-to-zero mode then set the
4158 | input-denormal exception and return zero. Otherwise just return the value.
4159 *----------------------------------------------------------------------------*/
4160 float16 float16_squash_input_denormal(float16 a, float_status *status)
4161 {
4162     if (status->flush_inputs_to_zero) {
4163         if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) {
4164             float_raise(float_flag_input_denormal, status);
4165             return make_float16(float16_val(a) & 0x8000);
4166         }
4167     }
4168     return a;
4169 }
4170 
4171 /*----------------------------------------------------------------------------
4172 | Returns the result of converting the double-precision floating-point value
4173 | `a' to the extended double-precision floating-point format.  The conversion
4174 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4175 | Arithmetic.
4176 *----------------------------------------------------------------------------*/
4177 
4178 floatx80 float64_to_floatx80(float64 a, float_status *status)
4179 {
4180     flag aSign;
4181     int aExp;
4182     uint64_t aSig;
4183 
4184     a = float64_squash_input_denormal(a, status);
4185     aSig = extractFloat64Frac( a );
4186     aExp = extractFloat64Exp( a );
4187     aSign = extractFloat64Sign( a );
4188     if ( aExp == 0x7FF ) {
4189         if (aSig) {
4190             return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
4191         }
4192         return packFloatx80(aSign,
4193                             floatx80_infinity_high,
4194                             floatx80_infinity_low);
4195     }
4196     if ( aExp == 0 ) {
4197         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
4198         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4199     }
4200     return
4201         packFloatx80(
4202             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
4203 
4204 }
4205 
4206 /*----------------------------------------------------------------------------
4207 | Returns the result of converting the double-precision floating-point value
4208 | `a' to the quadruple-precision floating-point format.  The conversion is
4209 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4210 | Arithmetic.
4211 *----------------------------------------------------------------------------*/
4212 
4213 float128 float64_to_float128(float64 a, float_status *status)
4214 {
4215     flag aSign;
4216     int aExp;
4217     uint64_t aSig, zSig0, zSig1;
4218 
4219     a = float64_squash_input_denormal(a, status);
4220     aSig = extractFloat64Frac( a );
4221     aExp = extractFloat64Exp( a );
4222     aSign = extractFloat64Sign( a );
4223     if ( aExp == 0x7FF ) {
4224         if (aSig) {
4225             return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
4226         }
4227         return packFloat128( aSign, 0x7FFF, 0, 0 );
4228     }
4229     if ( aExp == 0 ) {
4230         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
4231         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4232         --aExp;
4233     }
4234     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
4235     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
4236 
4237 }
4238 
4239 
4240 /*----------------------------------------------------------------------------
4241 | Returns the remainder of the double-precision floating-point value `a'
4242 | with respect to the corresponding value `b'.  The operation is performed
4243 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4244 *----------------------------------------------------------------------------*/
4245 
4246 float64 float64_rem(float64 a, float64 b, float_status *status)
4247 {
4248     flag aSign, zSign;
4249     int aExp, bExp, expDiff;
4250     uint64_t aSig, bSig;
4251     uint64_t q, alternateASig;
4252     int64_t sigMean;
4253 
4254     a = float64_squash_input_denormal(a, status);
4255     b = float64_squash_input_denormal(b, status);
4256     aSig = extractFloat64Frac( a );
4257     aExp = extractFloat64Exp( a );
4258     aSign = extractFloat64Sign( a );
4259     bSig = extractFloat64Frac( b );
4260     bExp = extractFloat64Exp( b );
4261     if ( aExp == 0x7FF ) {
4262         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
4263             return propagateFloat64NaN(a, b, status);
4264         }
4265         float_raise(float_flag_invalid, status);
4266         return float64_default_nan(status);
4267     }
4268     if ( bExp == 0x7FF ) {
4269         if (bSig) {
4270             return propagateFloat64NaN(a, b, status);
4271         }
4272         return a;
4273     }
4274     if ( bExp == 0 ) {
4275         if ( bSig == 0 ) {
4276             float_raise(float_flag_invalid, status);
4277             return float64_default_nan(status);
4278         }
4279         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4280     }
4281     if ( aExp == 0 ) {
4282         if ( aSig == 0 ) return a;
4283         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4284     }
4285     expDiff = aExp - bExp;
4286     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
4287     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4288     if ( expDiff < 0 ) {
4289         if ( expDiff < -1 ) return a;
4290         aSig >>= 1;
4291     }
4292     q = ( bSig <= aSig );
4293     if ( q ) aSig -= bSig;
4294     expDiff -= 64;
4295     while ( 0 < expDiff ) {
4296         q = estimateDiv128To64( aSig, 0, bSig );
4297         q = ( 2 < q ) ? q - 2 : 0;
4298         aSig = - ( ( bSig>>2 ) * q );
4299         expDiff -= 62;
4300     }
4301     expDiff += 64;
4302     if ( 0 < expDiff ) {
4303         q = estimateDiv128To64( aSig, 0, bSig );
4304         q = ( 2 < q ) ? q - 2 : 0;
4305         q >>= 64 - expDiff;
4306         bSig >>= 2;
4307         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4308     }
4309     else {
4310         aSig >>= 2;
4311         bSig >>= 2;
4312     }
4313     do {
4314         alternateASig = aSig;
4315         ++q;
4316         aSig -= bSig;
4317     } while ( 0 <= (int64_t) aSig );
4318     sigMean = aSig + alternateASig;
4319     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4320         aSig = alternateASig;
4321     }
4322     zSign = ( (int64_t) aSig < 0 );
4323     if ( zSign ) aSig = - aSig;
4324     return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
4325 
4326 }
4327 
4328 /*----------------------------------------------------------------------------
4329 | Returns the binary log of the double-precision floating-point value `a'.
4330 | The operation is performed according to the IEC/IEEE Standard for Binary
4331 | Floating-Point Arithmetic.
4332 *----------------------------------------------------------------------------*/
4333 float64 float64_log2(float64 a, float_status *status)
4334 {
4335     flag aSign, zSign;
4336     int aExp;
4337     uint64_t aSig, aSig0, aSig1, zSig, i;
4338     a = float64_squash_input_denormal(a, status);
4339 
4340     aSig = extractFloat64Frac( a );
4341     aExp = extractFloat64Exp( a );
4342     aSign = extractFloat64Sign( a );
4343 
4344     if ( aExp == 0 ) {
4345         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4346         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4347     }
4348     if ( aSign ) {
4349         float_raise(float_flag_invalid, status);
4350         return float64_default_nan(status);
4351     }
4352     if ( aExp == 0x7FF ) {
4353         if (aSig) {
4354             return propagateFloat64NaN(a, float64_zero, status);
4355         }
4356         return a;
4357     }
4358 
4359     aExp -= 0x3FF;
4360     aSig |= LIT64( 0x0010000000000000 );
4361     zSign = aExp < 0;
4362     zSig = (uint64_t)aExp << 52;
4363     for (i = 1LL << 51; i > 0; i >>= 1) {
4364         mul64To128( aSig, aSig, &aSig0, &aSig1 );
4365         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4366         if ( aSig & LIT64( 0x0020000000000000 ) ) {
4367             aSig >>= 1;
4368             zSig |= i;
4369         }
4370     }
4371 
4372     if ( zSign )
4373         zSig = -zSig;
4374     return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
4375 }
4376 
4377 /*----------------------------------------------------------------------------
4378 | Returns 1 if the double-precision floating-point value `a' is equal to the
4379 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
4380 | if either operand is a NaN.  Otherwise, the comparison is performed
4381 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4382 *----------------------------------------------------------------------------*/
4383 
4384 int float64_eq(float64 a, float64 b, float_status *status)
4385 {
4386     uint64_t av, bv;
4387     a = float64_squash_input_denormal(a, status);
4388     b = float64_squash_input_denormal(b, status);
4389 
4390     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4391          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4392        ) {
4393         float_raise(float_flag_invalid, status);
4394         return 0;
4395     }
4396     av = float64_val(a);
4397     bv = float64_val(b);
4398     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4399 
4400 }
4401 
4402 /*----------------------------------------------------------------------------
4403 | Returns 1 if the double-precision floating-point value `a' is less than or
4404 | equal to the corresponding value `b', and 0 otherwise.  The invalid
4405 | exception is raised if either operand is a NaN.  The comparison is performed
4406 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4407 *----------------------------------------------------------------------------*/
4408 
4409 int float64_le(float64 a, float64 b, float_status *status)
4410 {
4411     flag aSign, bSign;
4412     uint64_t av, bv;
4413     a = float64_squash_input_denormal(a, status);
4414     b = float64_squash_input_denormal(b, status);
4415 
4416     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4417          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4418        ) {
4419         float_raise(float_flag_invalid, status);
4420         return 0;
4421     }
4422     aSign = extractFloat64Sign( a );
4423     bSign = extractFloat64Sign( b );
4424     av = float64_val(a);
4425     bv = float64_val(b);
4426     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4427     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4428 
4429 }
4430 
4431 /*----------------------------------------------------------------------------
4432 | Returns 1 if the double-precision floating-point value `a' is less than
4433 | the corresponding value `b', and 0 otherwise.  The invalid exception is
4434 | raised if either operand is a NaN.  The comparison is performed according
4435 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4436 *----------------------------------------------------------------------------*/
4437 
4438 int float64_lt(float64 a, float64 b, float_status *status)
4439 {
4440     flag aSign, bSign;
4441     uint64_t av, bv;
4442 
4443     a = float64_squash_input_denormal(a, status);
4444     b = float64_squash_input_denormal(b, status);
4445     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4446          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4447        ) {
4448         float_raise(float_flag_invalid, status);
4449         return 0;
4450     }
4451     aSign = extractFloat64Sign( a );
4452     bSign = extractFloat64Sign( b );
4453     av = float64_val(a);
4454     bv = float64_val(b);
4455     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4456     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4457 
4458 }
4459 
4460 /*----------------------------------------------------------------------------
4461 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4462 | be compared, and 0 otherwise.  The invalid exception is raised if either
4463 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
4464 | Standard for Binary Floating-Point Arithmetic.
4465 *----------------------------------------------------------------------------*/
4466 
4467 int float64_unordered(float64 a, float64 b, float_status *status)
4468 {
4469     a = float64_squash_input_denormal(a, status);
4470     b = float64_squash_input_denormal(b, status);
4471 
4472     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4473          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4474        ) {
4475         float_raise(float_flag_invalid, status);
4476         return 1;
4477     }
4478     return 0;
4479 }
4480 
4481 /*----------------------------------------------------------------------------
4482 | Returns 1 if the double-precision floating-point value `a' is equal to the
4483 | corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4484 | exception.The comparison is performed according to the IEC/IEEE Standard
4485 | for Binary Floating-Point Arithmetic.
4486 *----------------------------------------------------------------------------*/
4487 
4488 int float64_eq_quiet(float64 a, float64 b, float_status *status)
4489 {
4490     uint64_t av, bv;
4491     a = float64_squash_input_denormal(a, status);
4492     b = float64_squash_input_denormal(b, status);
4493 
4494     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4495          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4496        ) {
4497         if (float64_is_signaling_nan(a, status)
4498          || float64_is_signaling_nan(b, status)) {
4499             float_raise(float_flag_invalid, status);
4500         }
4501         return 0;
4502     }
4503     av = float64_val(a);
4504     bv = float64_val(b);
4505     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4506 
4507 }
4508 
4509 /*----------------------------------------------------------------------------
4510 | Returns 1 if the double-precision floating-point value `a' is less than or
4511 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4512 | cause an exception.  Otherwise, the comparison is performed according to the
4513 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4514 *----------------------------------------------------------------------------*/
4515 
4516 int float64_le_quiet(float64 a, float64 b, float_status *status)
4517 {
4518     flag aSign, bSign;
4519     uint64_t av, bv;
4520     a = float64_squash_input_denormal(a, status);
4521     b = float64_squash_input_denormal(b, status);
4522 
4523     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4524          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4525        ) {
4526         if (float64_is_signaling_nan(a, status)
4527          || float64_is_signaling_nan(b, status)) {
4528             float_raise(float_flag_invalid, status);
4529         }
4530         return 0;
4531     }
4532     aSign = extractFloat64Sign( a );
4533     bSign = extractFloat64Sign( b );
4534     av = float64_val(a);
4535     bv = float64_val(b);
4536     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4537     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4538 
4539 }
4540 
4541 /*----------------------------------------------------------------------------
4542 | Returns 1 if the double-precision floating-point value `a' is less than
4543 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4544 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4545 | Standard for Binary Floating-Point Arithmetic.
4546 *----------------------------------------------------------------------------*/
4547 
4548 int float64_lt_quiet(float64 a, float64 b, float_status *status)
4549 {
4550     flag aSign, bSign;
4551     uint64_t av, bv;
4552     a = float64_squash_input_denormal(a, status);
4553     b = float64_squash_input_denormal(b, status);
4554 
4555     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4556          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4557        ) {
4558         if (float64_is_signaling_nan(a, status)
4559          || float64_is_signaling_nan(b, status)) {
4560             float_raise(float_flag_invalid, status);
4561         }
4562         return 0;
4563     }
4564     aSign = extractFloat64Sign( a );
4565     bSign = extractFloat64Sign( b );
4566     av = float64_val(a);
4567     bv = float64_val(b);
4568     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4569     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4570 
4571 }
4572 
4573 /*----------------------------------------------------------------------------
4574 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4575 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
4576 | comparison is performed according to the IEC/IEEE Standard for Binary
4577 | Floating-Point Arithmetic.
4578 *----------------------------------------------------------------------------*/
4579 
4580 int float64_unordered_quiet(float64 a, float64 b, float_status *status)
4581 {
4582     a = float64_squash_input_denormal(a, status);
4583     b = float64_squash_input_denormal(b, status);
4584 
4585     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4586          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4587        ) {
4588         if (float64_is_signaling_nan(a, status)
4589          || float64_is_signaling_nan(b, status)) {
4590             float_raise(float_flag_invalid, status);
4591         }
4592         return 1;
4593     }
4594     return 0;
4595 }
4596 
4597 /*----------------------------------------------------------------------------
4598 | Returns the result of converting the extended double-precision floating-
4599 | point value `a' to the 32-bit two's complement integer format.  The
4600 | conversion is performed according to the IEC/IEEE Standard for Binary
4601 | Floating-Point Arithmetic---which means in particular that the conversion
4602 | is rounded according to the current rounding mode.  If `a' is a NaN, the
4603 | largest positive integer is returned.  Otherwise, if the conversion
4604 | overflows, the largest integer with the same sign as `a' is returned.
4605 *----------------------------------------------------------------------------*/
4606 
4607 int32_t floatx80_to_int32(floatx80 a, float_status *status)
4608 {
4609     flag aSign;
4610     int32_t aExp, shiftCount;
4611     uint64_t aSig;
4612 
4613     if (floatx80_invalid_encoding(a)) {
4614         float_raise(float_flag_invalid, status);
4615         return 1 << 31;
4616     }
4617     aSig = extractFloatx80Frac( a );
4618     aExp = extractFloatx80Exp( a );
4619     aSign = extractFloatx80Sign( a );
4620     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4621     shiftCount = 0x4037 - aExp;
4622     if ( shiftCount <= 0 ) shiftCount = 1;
4623     shift64RightJamming( aSig, shiftCount, &aSig );
4624     return roundAndPackInt32(aSign, aSig, status);
4625 
4626 }
4627 
4628 /*----------------------------------------------------------------------------
4629 | Returns the result of converting the extended double-precision floating-
4630 | point value `a' to the 32-bit two's complement integer format.  The
4631 | conversion is performed according to the IEC/IEEE Standard for Binary
4632 | Floating-Point Arithmetic, except that the conversion is always rounded
4633 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4634 | Otherwise, if the conversion overflows, the largest integer with the same
4635 | sign as `a' is returned.
4636 *----------------------------------------------------------------------------*/
4637 
4638 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
4639 {
4640     flag aSign;
4641     int32_t aExp, shiftCount;
4642     uint64_t aSig, savedASig;
4643     int32_t z;
4644 
4645     if (floatx80_invalid_encoding(a)) {
4646         float_raise(float_flag_invalid, status);
4647         return 1 << 31;
4648     }
4649     aSig = extractFloatx80Frac( a );
4650     aExp = extractFloatx80Exp( a );
4651     aSign = extractFloatx80Sign( a );
4652     if ( 0x401E < aExp ) {
4653         if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4654         goto invalid;
4655     }
4656     else if ( aExp < 0x3FFF ) {
4657         if (aExp || aSig) {
4658             status->float_exception_flags |= float_flag_inexact;
4659         }
4660         return 0;
4661     }
4662     shiftCount = 0x403E - aExp;
4663     savedASig = aSig;
4664     aSig >>= shiftCount;
4665     z = aSig;
4666     if ( aSign ) z = - z;
4667     if ( ( z < 0 ) ^ aSign ) {
4668  invalid:
4669         float_raise(float_flag_invalid, status);
4670         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4671     }
4672     if ( ( aSig<<shiftCount ) != savedASig ) {
4673         status->float_exception_flags |= float_flag_inexact;
4674     }
4675     return z;
4676 
4677 }
4678 
4679 /*----------------------------------------------------------------------------
4680 | Returns the result of converting the extended double-precision floating-
4681 | point value `a' to the 64-bit two's complement integer format.  The
4682 | conversion is performed according to the IEC/IEEE Standard for Binary
4683 | Floating-Point Arithmetic---which means in particular that the conversion
4684 | is rounded according to the current rounding mode.  If `a' is a NaN,
4685 | the largest positive integer is returned.  Otherwise, if the conversion
4686 | overflows, the largest integer with the same sign as `a' is returned.
4687 *----------------------------------------------------------------------------*/
4688 
4689 int64_t floatx80_to_int64(floatx80 a, float_status *status)
4690 {
4691     flag aSign;
4692     int32_t aExp, shiftCount;
4693     uint64_t aSig, aSigExtra;
4694 
4695     if (floatx80_invalid_encoding(a)) {
4696         float_raise(float_flag_invalid, status);
4697         return 1ULL << 63;
4698     }
4699     aSig = extractFloatx80Frac( a );
4700     aExp = extractFloatx80Exp( a );
4701     aSign = extractFloatx80Sign( a );
4702     shiftCount = 0x403E - aExp;
4703     if ( shiftCount <= 0 ) {
4704         if ( shiftCount ) {
4705             float_raise(float_flag_invalid, status);
4706             if (!aSign || floatx80_is_any_nan(a)) {
4707                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4708             }
4709             return (int64_t) LIT64( 0x8000000000000000 );
4710         }
4711         aSigExtra = 0;
4712     }
4713     else {
4714         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4715     }
4716     return roundAndPackInt64(aSign, aSig, aSigExtra, status);
4717 
4718 }
4719 
4720 /*----------------------------------------------------------------------------
4721 | Returns the result of converting the extended double-precision floating-
4722 | point value `a' to the 64-bit two's complement integer format.  The
4723 | conversion is performed according to the IEC/IEEE Standard for Binary
4724 | Floating-Point Arithmetic, except that the conversion is always rounded
4725 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4726 | Otherwise, if the conversion overflows, the largest integer with the same
4727 | sign as `a' is returned.
4728 *----------------------------------------------------------------------------*/
4729 
4730 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
4731 {
4732     flag aSign;
4733     int32_t aExp, shiftCount;
4734     uint64_t aSig;
4735     int64_t z;
4736 
4737     if (floatx80_invalid_encoding(a)) {
4738         float_raise(float_flag_invalid, status);
4739         return 1ULL << 63;
4740     }
4741     aSig = extractFloatx80Frac( a );
4742     aExp = extractFloatx80Exp( a );
4743     aSign = extractFloatx80Sign( a );
4744     shiftCount = aExp - 0x403E;
4745     if ( 0 <= shiftCount ) {
4746         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4747         if ( ( a.high != 0xC03E ) || aSig ) {
4748             float_raise(float_flag_invalid, status);
4749             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4750                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4751             }
4752         }
4753         return (int64_t) LIT64( 0x8000000000000000 );
4754     }
4755     else if ( aExp < 0x3FFF ) {
4756         if (aExp | aSig) {
4757             status->float_exception_flags |= float_flag_inexact;
4758         }
4759         return 0;
4760     }
4761     z = aSig>>( - shiftCount );
4762     if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
4763         status->float_exception_flags |= float_flag_inexact;
4764     }
4765     if ( aSign ) z = - z;
4766     return z;
4767 
4768 }
4769 
4770 /*----------------------------------------------------------------------------
4771 | Returns the result of converting the extended double-precision floating-
4772 | point value `a' to the single-precision floating-point format.  The
4773 | conversion is performed according to the IEC/IEEE Standard for Binary
4774 | Floating-Point Arithmetic.
4775 *----------------------------------------------------------------------------*/
4776 
4777 float32 floatx80_to_float32(floatx80 a, float_status *status)
4778 {
4779     flag aSign;
4780     int32_t aExp;
4781     uint64_t aSig;
4782 
4783     if (floatx80_invalid_encoding(a)) {
4784         float_raise(float_flag_invalid, status);
4785         return float32_default_nan(status);
4786     }
4787     aSig = extractFloatx80Frac( a );
4788     aExp = extractFloatx80Exp( a );
4789     aSign = extractFloatx80Sign( a );
4790     if ( aExp == 0x7FFF ) {
4791         if ( (uint64_t) ( aSig<<1 ) ) {
4792             return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
4793         }
4794         return packFloat32( aSign, 0xFF, 0 );
4795     }
4796     shift64RightJamming( aSig, 33, &aSig );
4797     if ( aExp || aSig ) aExp -= 0x3F81;
4798     return roundAndPackFloat32(aSign, aExp, aSig, status);
4799 
4800 }
4801 
4802 /*----------------------------------------------------------------------------
4803 | Returns the result of converting the extended double-precision floating-
4804 | point value `a' to the double-precision floating-point format.  The
4805 | conversion is performed according to the IEC/IEEE Standard for Binary
4806 | Floating-Point Arithmetic.
4807 *----------------------------------------------------------------------------*/
4808 
4809 float64 floatx80_to_float64(floatx80 a, float_status *status)
4810 {
4811     flag aSign;
4812     int32_t aExp;
4813     uint64_t aSig, zSig;
4814 
4815     if (floatx80_invalid_encoding(a)) {
4816         float_raise(float_flag_invalid, status);
4817         return float64_default_nan(status);
4818     }
4819     aSig = extractFloatx80Frac( a );
4820     aExp = extractFloatx80Exp( a );
4821     aSign = extractFloatx80Sign( a );
4822     if ( aExp == 0x7FFF ) {
4823         if ( (uint64_t) ( aSig<<1 ) ) {
4824             return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
4825         }
4826         return packFloat64( aSign, 0x7FF, 0 );
4827     }
4828     shift64RightJamming( aSig, 1, &zSig );
4829     if ( aExp || aSig ) aExp -= 0x3C01;
4830     return roundAndPackFloat64(aSign, aExp, zSig, status);
4831 
4832 }
4833 
4834 /*----------------------------------------------------------------------------
4835 | Returns the result of converting the extended double-precision floating-
4836 | point value `a' to the quadruple-precision floating-point format.  The
4837 | conversion is performed according to the IEC/IEEE Standard for Binary
4838 | Floating-Point Arithmetic.
4839 *----------------------------------------------------------------------------*/
4840 
4841 float128 floatx80_to_float128(floatx80 a, float_status *status)
4842 {
4843     flag aSign;
4844     int aExp;
4845     uint64_t aSig, zSig0, zSig1;
4846 
4847     if (floatx80_invalid_encoding(a)) {
4848         float_raise(float_flag_invalid, status);
4849         return float128_default_nan(status);
4850     }
4851     aSig = extractFloatx80Frac( a );
4852     aExp = extractFloatx80Exp( a );
4853     aSign = extractFloatx80Sign( a );
4854     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
4855         return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
4856     }
4857     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4858     return packFloat128( aSign, aExp, zSig0, zSig1 );
4859 
4860 }
4861 
4862 /*----------------------------------------------------------------------------
4863 | Rounds the extended double-precision floating-point value `a'
4864 | to the precision provided by floatx80_rounding_precision and returns the
4865 | result as an extended double-precision floating-point value.
4866 | The operation is performed according to the IEC/IEEE Standard for Binary
4867 | Floating-Point Arithmetic.
4868 *----------------------------------------------------------------------------*/
4869 
4870 floatx80 floatx80_round(floatx80 a, float_status *status)
4871 {
4872     return roundAndPackFloatx80(status->floatx80_rounding_precision,
4873                                 extractFloatx80Sign(a),
4874                                 extractFloatx80Exp(a),
4875                                 extractFloatx80Frac(a), 0, status);
4876 }
4877 
/*----------------------------------------------------------------------------
| Rounds the extended double-precision floating-point value `a' to an integer,
| and returns the result as an extended double-precision floating-point
| value.  The operation is performed according to the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
4884 
4885 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
4886 {
4887     flag aSign;
4888     int32_t aExp;
4889     uint64_t lastBitMask, roundBitsMask;
4890     floatx80 z;
4891 
4892     if (floatx80_invalid_encoding(a)) {
4893         float_raise(float_flag_invalid, status);
4894         return floatx80_default_nan(status);
4895     }
4896     aExp = extractFloatx80Exp( a );
4897     if ( 0x403E <= aExp ) {
4898         if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
4899             return propagateFloatx80NaN(a, a, status);
4900         }
4901         return a;
4902     }
4903     if ( aExp < 0x3FFF ) {
4904         if (    ( aExp == 0 )
4905              && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
4906             return a;
4907         }
4908         status->float_exception_flags |= float_flag_inexact;
4909         aSign = extractFloatx80Sign( a );
4910         switch (status->float_rounding_mode) {
4911          case float_round_nearest_even:
4912             if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
4913                ) {
4914                 return
4915                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4916             }
4917             break;
4918         case float_round_ties_away:
4919             if (aExp == 0x3FFE) {
4920                 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
4921             }
4922             break;
4923          case float_round_down:
4924             return
4925                   aSign ?
4926                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4927                 : packFloatx80( 0, 0, 0 );
4928          case float_round_up:
4929             return
4930                   aSign ? packFloatx80( 1, 0, 0 )
4931                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4932         }
4933         return packFloatx80( aSign, 0, 0 );
4934     }
4935     lastBitMask = 1;
4936     lastBitMask <<= 0x403E - aExp;
4937     roundBitsMask = lastBitMask - 1;
4938     z = a;
4939     switch (status->float_rounding_mode) {
4940     case float_round_nearest_even:
4941         z.low += lastBitMask>>1;
4942         if ((z.low & roundBitsMask) == 0) {
4943             z.low &= ~lastBitMask;
4944         }
4945         break;
4946     case float_round_ties_away:
4947         z.low += lastBitMask >> 1;
4948         break;
4949     case float_round_to_zero:
4950         break;
4951     case float_round_up:
4952         if (!extractFloatx80Sign(z)) {
4953             z.low += roundBitsMask;
4954         }
4955         break;
4956     case float_round_down:
4957         if (extractFloatx80Sign(z)) {
4958             z.low += roundBitsMask;
4959         }
4960         break;
4961     default:
4962         abort();
4963     }
4964     z.low &= ~ roundBitsMask;
4965     if ( z.low == 0 ) {
4966         ++z.high;
4967         z.low = LIT64( 0x8000000000000000 );
4968     }
4969     if (z.low != a.low) {
4970         status->float_exception_flags |= float_flag_inexact;
4971     }
4972     return z;
4973 
4974 }
4975 
4976 /*----------------------------------------------------------------------------
4977 | Returns the result of adding the absolute values of the extended double-
4978 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
4979 | negated before being returned.  `zSign' is ignored if the result is a NaN.
4980 | The addition is performed according to the IEC/IEEE Standard for Binary
4981 | Floating-Point Arithmetic.
4982 *----------------------------------------------------------------------------*/
4983 
4984 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
4985                                 float_status *status)
4986 {
4987     int32_t aExp, bExp, zExp;
4988     uint64_t aSig, bSig, zSig0, zSig1;
4989     int32_t expDiff;
4990 
4991     aSig = extractFloatx80Frac( a );
4992     aExp = extractFloatx80Exp( a );
4993     bSig = extractFloatx80Frac( b );
4994     bExp = extractFloatx80Exp( b );
4995     expDiff = aExp - bExp;
4996     if ( 0 < expDiff ) {
4997         if ( aExp == 0x7FFF ) {
4998             if ((uint64_t)(aSig << 1)) {
4999                 return propagateFloatx80NaN(a, b, status);
5000             }
5001             return a;
5002         }
5003         if ( bExp == 0 ) --expDiff;
5004         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5005         zExp = aExp;
5006     }
5007     else if ( expDiff < 0 ) {
5008         if ( bExp == 0x7FFF ) {
5009             if ((uint64_t)(bSig << 1)) {
5010                 return propagateFloatx80NaN(a, b, status);
5011             }
5012             return packFloatx80(zSign,
5013                                 floatx80_infinity_high,
5014                                 floatx80_infinity_low);
5015         }
5016         if ( aExp == 0 ) ++expDiff;
5017         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5018         zExp = bExp;
5019     }
5020     else {
5021         if ( aExp == 0x7FFF ) {
5022             if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5023                 return propagateFloatx80NaN(a, b, status);
5024             }
5025             return a;
5026         }
5027         zSig1 = 0;
5028         zSig0 = aSig + bSig;
5029         if ( aExp == 0 ) {
5030             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
5031             goto roundAndPack;
5032         }
5033         zExp = aExp;
5034         goto shiftRight1;
5035     }
5036     zSig0 = aSig + bSig;
5037     if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
5038  shiftRight1:
5039     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
5040     zSig0 |= LIT64( 0x8000000000000000 );
5041     ++zExp;
5042  roundAndPack:
5043     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5044                                 zSign, zExp, zSig0, zSig1, status);
5045 }
5046 
5047 /*----------------------------------------------------------------------------
5048 | Returns the result of subtracting the absolute values of the extended
5049 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
5050 | difference is negated before being returned.  `zSign' is ignored if the
5051 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
5052 | Standard for Binary Floating-Point Arithmetic.
5053 *----------------------------------------------------------------------------*/
5054 
5055 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5056                                 float_status *status)
5057 {
5058     int32_t aExp, bExp, zExp;
5059     uint64_t aSig, bSig, zSig0, zSig1;
5060     int32_t expDiff;
5061 
5062     aSig = extractFloatx80Frac( a );
5063     aExp = extractFloatx80Exp( a );
5064     bSig = extractFloatx80Frac( b );
5065     bExp = extractFloatx80Exp( b );
5066     expDiff = aExp - bExp;
5067     if ( 0 < expDiff ) goto aExpBigger;
5068     if ( expDiff < 0 ) goto bExpBigger;
5069     if ( aExp == 0x7FFF ) {
5070         if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5071             return propagateFloatx80NaN(a, b, status);
5072         }
5073         float_raise(float_flag_invalid, status);
5074         return floatx80_default_nan(status);
5075     }
5076     if ( aExp == 0 ) {
5077         aExp = 1;
5078         bExp = 1;
5079     }
5080     zSig1 = 0;
5081     if ( bSig < aSig ) goto aBigger;
5082     if ( aSig < bSig ) goto bBigger;
5083     return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
5084  bExpBigger:
5085     if ( bExp == 0x7FFF ) {
5086         if ((uint64_t)(bSig << 1)) {
5087             return propagateFloatx80NaN(a, b, status);
5088         }
5089         return packFloatx80(zSign ^ 1, floatx80_infinity_high,
5090                             floatx80_infinity_low);
5091     }
5092     if ( aExp == 0 ) ++expDiff;
5093     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5094  bBigger:
5095     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
5096     zExp = bExp;
5097     zSign ^= 1;
5098     goto normalizeRoundAndPack;
5099  aExpBigger:
5100     if ( aExp == 0x7FFF ) {
5101         if ((uint64_t)(aSig << 1)) {
5102             return propagateFloatx80NaN(a, b, status);
5103         }
5104         return a;
5105     }
5106     if ( bExp == 0 ) --expDiff;
5107     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5108  aBigger:
5109     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
5110     zExp = aExp;
5111  normalizeRoundAndPack:
5112     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
5113                                          zSign, zExp, zSig0, zSig1, status);
5114 }
5115 
5116 /*----------------------------------------------------------------------------
5117 | Returns the result of adding the extended double-precision floating-point
5118 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
5119 | Standard for Binary Floating-Point Arithmetic.
5120 *----------------------------------------------------------------------------*/
5121 
5122 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
5123 {
5124     flag aSign, bSign;
5125 
5126     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5127         float_raise(float_flag_invalid, status);
5128         return floatx80_default_nan(status);
5129     }
5130     aSign = extractFloatx80Sign( a );
5131     bSign = extractFloatx80Sign( b );
5132     if ( aSign == bSign ) {
5133         return addFloatx80Sigs(a, b, aSign, status);
5134     }
5135     else {
5136         return subFloatx80Sigs(a, b, aSign, status);
5137     }
5138 
5139 }
5140 
5141 /*----------------------------------------------------------------------------
5142 | Returns the result of subtracting the extended double-precision floating-
5143 | point values `a' and `b'.  The operation is performed according to the
5144 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5145 *----------------------------------------------------------------------------*/
5146 
5147 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
5148 {
5149     flag aSign, bSign;
5150 
5151     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5152         float_raise(float_flag_invalid, status);
5153         return floatx80_default_nan(status);
5154     }
5155     aSign = extractFloatx80Sign( a );
5156     bSign = extractFloatx80Sign( b );
5157     if ( aSign == bSign ) {
5158         return subFloatx80Sigs(a, b, aSign, status);
5159     }
5160     else {
5161         return addFloatx80Sigs(a, b, aSign, status);
5162     }
5163 
5164 }
5165 
5166 /*----------------------------------------------------------------------------
5167 | Returns the result of multiplying the extended double-precision floating-
5168 | point values `a' and `b'.  The operation is performed according to the
5169 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5170 *----------------------------------------------------------------------------*/
5171 
5172 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
5173 {
5174     flag aSign, bSign, zSign;
5175     int32_t aExp, bExp, zExp;
5176     uint64_t aSig, bSig, zSig0, zSig1;
5177 
5178     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5179         float_raise(float_flag_invalid, status);
5180         return floatx80_default_nan(status);
5181     }
5182     aSig = extractFloatx80Frac( a );
5183     aExp = extractFloatx80Exp( a );
5184     aSign = extractFloatx80Sign( a );
5185     bSig = extractFloatx80Frac( b );
5186     bExp = extractFloatx80Exp( b );
5187     bSign = extractFloatx80Sign( b );
5188     zSign = aSign ^ bSign;
5189     if ( aExp == 0x7FFF ) {
5190         if (    (uint64_t) ( aSig<<1 )
5191              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5192             return propagateFloatx80NaN(a, b, status);
5193         }
5194         if ( ( bExp | bSig ) == 0 ) goto invalid;
5195         return packFloatx80(zSign, floatx80_infinity_high,
5196                                    floatx80_infinity_low);
5197     }
5198     if ( bExp == 0x7FFF ) {
5199         if ((uint64_t)(bSig << 1)) {
5200             return propagateFloatx80NaN(a, b, status);
5201         }
5202         if ( ( aExp | aSig ) == 0 ) {
5203  invalid:
5204             float_raise(float_flag_invalid, status);
5205             return floatx80_default_nan(status);
5206         }
5207         return packFloatx80(zSign, floatx80_infinity_high,
5208                                    floatx80_infinity_low);
5209     }
5210     if ( aExp == 0 ) {
5211         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5212         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5213     }
5214     if ( bExp == 0 ) {
5215         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5216         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5217     }
5218     zExp = aExp + bExp - 0x3FFE;
5219     mul64To128( aSig, bSig, &zSig0, &zSig1 );
5220     if ( 0 < (int64_t) zSig0 ) {
5221         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5222         --zExp;
5223     }
5224     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5225                                 zSign, zExp, zSig0, zSig1, status);
5226 }
5227 
5228 /*----------------------------------------------------------------------------
5229 | Returns the result of dividing the extended double-precision floating-point
5230 | value `a' by the corresponding value `b'.  The operation is performed
5231 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5232 *----------------------------------------------------------------------------*/
5233 
5234 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
5235 {
5236     flag aSign, bSign, zSign;
5237     int32_t aExp, bExp, zExp;
5238     uint64_t aSig, bSig, zSig0, zSig1;
5239     uint64_t rem0, rem1, rem2, term0, term1, term2;
5240 
5241     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5242         float_raise(float_flag_invalid, status);
5243         return floatx80_default_nan(status);
5244     }
5245     aSig = extractFloatx80Frac( a );
5246     aExp = extractFloatx80Exp( a );
5247     aSign = extractFloatx80Sign( a );
5248     bSig = extractFloatx80Frac( b );
5249     bExp = extractFloatx80Exp( b );
5250     bSign = extractFloatx80Sign( b );
5251     zSign = aSign ^ bSign;
5252     if ( aExp == 0x7FFF ) {
5253         if ((uint64_t)(aSig << 1)) {
5254             return propagateFloatx80NaN(a, b, status);
5255         }
5256         if ( bExp == 0x7FFF ) {
5257             if ((uint64_t)(bSig << 1)) {
5258                 return propagateFloatx80NaN(a, b, status);
5259             }
5260             goto invalid;
5261         }
5262         return packFloatx80(zSign, floatx80_infinity_high,
5263                                    floatx80_infinity_low);
5264     }
5265     if ( bExp == 0x7FFF ) {
5266         if ((uint64_t)(bSig << 1)) {
5267             return propagateFloatx80NaN(a, b, status);
5268         }
5269         return packFloatx80( zSign, 0, 0 );
5270     }
5271     if ( bExp == 0 ) {
5272         if ( bSig == 0 ) {
5273             if ( ( aExp | aSig ) == 0 ) {
5274  invalid:
5275                 float_raise(float_flag_invalid, status);
5276                 return floatx80_default_nan(status);
5277             }
5278             float_raise(float_flag_divbyzero, status);
5279             return packFloatx80(zSign, floatx80_infinity_high,
5280                                        floatx80_infinity_low);
5281         }
5282         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5283     }
5284     if ( aExp == 0 ) {
5285         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5286         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5287     }
5288     zExp = aExp - bExp + 0x3FFE;
5289     rem1 = 0;
5290     if ( bSig <= aSig ) {
5291         shift128Right( aSig, 0, 1, &aSig, &rem1 );
5292         ++zExp;
5293     }
5294     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5295     mul64To128( bSig, zSig0, &term0, &term1 );
5296     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
5297     while ( (int64_t) rem0 < 0 ) {
5298         --zSig0;
5299         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5300     }
5301     zSig1 = estimateDiv128To64( rem1, 0, bSig );
5302     if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
5303         mul64To128( bSig, zSig1, &term1, &term2 );
5304         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5305         while ( (int64_t) rem1 < 0 ) {
5306             --zSig1;
5307             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5308         }
5309         zSig1 |= ( ( rem1 | rem2 ) != 0 );
5310     }
5311     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5312                                 zSign, zExp, zSig0, zSig1, status);
5313 }
5314 
5315 /*----------------------------------------------------------------------------
5316 | Returns the remainder of the extended double-precision floating-point value
5317 | `a' with respect to the corresponding value `b'.  The operation is performed
5318 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5319 *----------------------------------------------------------------------------*/
5320 
5321 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
5322 {
5323     flag aSign, zSign;
5324     int32_t aExp, bExp, expDiff;
5325     uint64_t aSig0, aSig1, bSig;
5326     uint64_t q, term0, term1, alternateASig0, alternateASig1;
5327 
5328     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5329         float_raise(float_flag_invalid, status);
5330         return floatx80_default_nan(status);
5331     }
5332     aSig0 = extractFloatx80Frac( a );
5333     aExp = extractFloatx80Exp( a );
5334     aSign = extractFloatx80Sign( a );
5335     bSig = extractFloatx80Frac( b );
5336     bExp = extractFloatx80Exp( b );
5337     if ( aExp == 0x7FFF ) {
5338         if (    (uint64_t) ( aSig0<<1 )
5339              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5340             return propagateFloatx80NaN(a, b, status);
5341         }
5342         goto invalid;
5343     }
5344     if ( bExp == 0x7FFF ) {
5345         if ((uint64_t)(bSig << 1)) {
5346             return propagateFloatx80NaN(a, b, status);
5347         }
5348         return a;
5349     }
5350     if ( bExp == 0 ) {
5351         if ( bSig == 0 ) {
5352  invalid:
5353             float_raise(float_flag_invalid, status);
5354             return floatx80_default_nan(status);
5355         }
5356         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5357     }
5358     if ( aExp == 0 ) {
5359         if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
5360         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5361     }
5362     bSig |= LIT64( 0x8000000000000000 );
5363     zSign = aSign;
5364     expDiff = aExp - bExp;
5365     aSig1 = 0;
5366     if ( expDiff < 0 ) {
5367         if ( expDiff < -1 ) return a;
5368         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5369         expDiff = 0;
5370     }
5371     q = ( bSig <= aSig0 );
5372     if ( q ) aSig0 -= bSig;
5373     expDiff -= 64;
5374     while ( 0 < expDiff ) {
5375         q = estimateDiv128To64( aSig0, aSig1, bSig );
5376         q = ( 2 < q ) ? q - 2 : 0;
5377         mul64To128( bSig, q, &term0, &term1 );
5378         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5379         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5380         expDiff -= 62;
5381     }
5382     expDiff += 64;
5383     if ( 0 < expDiff ) {
5384         q = estimateDiv128To64( aSig0, aSig1, bSig );
5385         q = ( 2 < q ) ? q - 2 : 0;
5386         q >>= 64 - expDiff;
5387         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5388         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5389         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5390         while ( le128( term0, term1, aSig0, aSig1 ) ) {
5391             ++q;
5392             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5393         }
5394     }
5395     else {
5396         term1 = 0;
5397         term0 = bSig;
5398     }
5399     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5400     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5401          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5402               && ( q & 1 ) )
5403        ) {
5404         aSig0 = alternateASig0;
5405         aSig1 = alternateASig1;
5406         zSign = ! zSign;
5407     }
5408     return
5409         normalizeRoundAndPackFloatx80(
5410             80, zSign, bExp + expDiff, aSig0, aSig1, status);
5411 
5412 }
5413 
5414 /*----------------------------------------------------------------------------
5415 | Returns the square root of the extended double-precision floating-point
5416 | value `a'.  The operation is performed according to the IEC/IEEE Standard
5417 | for Binary Floating-Point Arithmetic.
5418 *----------------------------------------------------------------------------*/
5419 
5420 floatx80 floatx80_sqrt(floatx80 a, float_status *status)
5421 {
5422     flag aSign;
5423     int32_t aExp, zExp;
5424     uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5425     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5426 
5427     if (floatx80_invalid_encoding(a)) {
5428         float_raise(float_flag_invalid, status);
5429         return floatx80_default_nan(status);
5430     }
5431     aSig0 = extractFloatx80Frac( a );
5432     aExp = extractFloatx80Exp( a );
5433     aSign = extractFloatx80Sign( a );
5434     if ( aExp == 0x7FFF ) {
5435         if ((uint64_t)(aSig0 << 1)) {
5436             return propagateFloatx80NaN(a, a, status);
5437         }
5438         if ( ! aSign ) return a;
5439         goto invalid;
5440     }
5441     if ( aSign ) {
5442         if ( ( aExp | aSig0 ) == 0 ) return a;
5443  invalid:
5444         float_raise(float_flag_invalid, status);
5445         return floatx80_default_nan(status);
5446     }
5447     if ( aExp == 0 ) {
5448         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5449         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5450     }
5451     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5452     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5453     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5454     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5455     doubleZSig0 = zSig0<<1;
5456     mul64To128( zSig0, zSig0, &term0, &term1 );
5457     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5458     while ( (int64_t) rem0 < 0 ) {
5459         --zSig0;
5460         doubleZSig0 -= 2;
5461         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5462     }
5463     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5464     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5465         if ( zSig1 == 0 ) zSig1 = 1;
5466         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5467         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5468         mul64To128( zSig1, zSig1, &term2, &term3 );
5469         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5470         while ( (int64_t) rem1 < 0 ) {
5471             --zSig1;
5472             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5473             term3 |= 1;
5474             term2 |= doubleZSig0;
5475             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5476         }
5477         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5478     }
5479     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5480     zSig0 |= doubleZSig0;
5481     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5482                                 0, zExp, zSig0, zSig1, status);
5483 }
5484 
5485 /*----------------------------------------------------------------------------
5486 | Returns 1 if the extended double-precision floating-point value `a' is equal
5487 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
5488 | raised if either operand is a NaN.  Otherwise, the comparison is performed
5489 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5490 *----------------------------------------------------------------------------*/
5491 
5492 int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
5493 {
5494 
5495     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5496         || (extractFloatx80Exp(a) == 0x7FFF
5497             && (uint64_t) (extractFloatx80Frac(a) << 1))
5498         || (extractFloatx80Exp(b) == 0x7FFF
5499             && (uint64_t) (extractFloatx80Frac(b) << 1))
5500        ) {
5501         float_raise(float_flag_invalid, status);
5502         return 0;
5503     }
5504     return
5505            ( a.low == b.low )
5506         && (    ( a.high == b.high )
5507              || (    ( a.low == 0 )
5508                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5509            );
5510 
5511 }
5512 
5513 /*----------------------------------------------------------------------------
5514 | Returns 1 if the extended double-precision floating-point value `a' is
5515 | less than or equal to the corresponding value `b', and 0 otherwise.  The
5516 | invalid exception is raised if either operand is a NaN.  The comparison is
5517 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5518 | Arithmetic.
5519 *----------------------------------------------------------------------------*/
5520 
5521 int floatx80_le(floatx80 a, floatx80 b, float_status *status)
5522 {
5523     flag aSign, bSign;
5524 
5525     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5526         || (extractFloatx80Exp(a) == 0x7FFF
5527             && (uint64_t) (extractFloatx80Frac(a) << 1))
5528         || (extractFloatx80Exp(b) == 0x7FFF
5529             && (uint64_t) (extractFloatx80Frac(b) << 1))
5530        ) {
5531         float_raise(float_flag_invalid, status);
5532         return 0;
5533     }
5534     aSign = extractFloatx80Sign( a );
5535     bSign = extractFloatx80Sign( b );
5536     if ( aSign != bSign ) {
5537         return
5538                aSign
5539             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5540                  == 0 );
5541     }
5542     return
5543           aSign ? le128( b.high, b.low, a.high, a.low )
5544         : le128( a.high, a.low, b.high, b.low );
5545 
5546 }
5547 
5548 /*----------------------------------------------------------------------------
5549 | Returns 1 if the extended double-precision floating-point value `a' is
5550 | less than the corresponding value `b', and 0 otherwise.  The invalid
5551 | exception is raised if either operand is a NaN.  The comparison is performed
5552 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5553 *----------------------------------------------------------------------------*/
5554 
5555 int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
5556 {
5557     flag aSign, bSign;
5558 
5559     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5560         || (extractFloatx80Exp(a) == 0x7FFF
5561             && (uint64_t) (extractFloatx80Frac(a) << 1))
5562         || (extractFloatx80Exp(b) == 0x7FFF
5563             && (uint64_t) (extractFloatx80Frac(b) << 1))
5564        ) {
5565         float_raise(float_flag_invalid, status);
5566         return 0;
5567     }
5568     aSign = extractFloatx80Sign( a );
5569     bSign = extractFloatx80Sign( b );
5570     if ( aSign != bSign ) {
5571         return
5572                aSign
5573             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5574                  != 0 );
5575     }
5576     return
5577           aSign ? lt128( b.high, b.low, a.high, a.low )
5578         : lt128( a.high, a.low, b.high, b.low );
5579 
5580 }
5581 
5582 /*----------------------------------------------------------------------------
5583 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5584 | cannot be compared, and 0 otherwise.  The invalid exception is raised if
5585 | either operand is a NaN.   The comparison is performed according to the
5586 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5587 *----------------------------------------------------------------------------*/
5588 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
5589 {
5590     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5591         || (extractFloatx80Exp(a) == 0x7FFF
5592             && (uint64_t) (extractFloatx80Frac(a) << 1))
5593         || (extractFloatx80Exp(b) == 0x7FFF
5594             && (uint64_t) (extractFloatx80Frac(b) << 1))
5595        ) {
5596         float_raise(float_flag_invalid, status);
5597         return 1;
5598     }
5599     return 0;
5600 }
5601 
5602 /*----------------------------------------------------------------------------
5603 | Returns 1 if the extended double-precision floating-point value `a' is
5604 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5605 | cause an exception.  The comparison is performed according to the IEC/IEEE
5606 | Standard for Binary Floating-Point Arithmetic.
5607 *----------------------------------------------------------------------------*/
5608 
5609 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
5610 {
5611 
5612     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5613         float_raise(float_flag_invalid, status);
5614         return 0;
5615     }
5616     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5617               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5618          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5619               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5620        ) {
5621         if (floatx80_is_signaling_nan(a, status)
5622          || floatx80_is_signaling_nan(b, status)) {
5623             float_raise(float_flag_invalid, status);
5624         }
5625         return 0;
5626     }
5627     return
5628            ( a.low == b.low )
5629         && (    ( a.high == b.high )
5630              || (    ( a.low == 0 )
5631                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5632            );
5633 
5634 }
5635 
5636 /*----------------------------------------------------------------------------
5637 | Returns 1 if the extended double-precision floating-point value `a' is less
5638 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
5639 | do not cause an exception.  Otherwise, the comparison is performed according
5640 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5641 *----------------------------------------------------------------------------*/
5642 
5643 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
5644 {
5645     flag aSign, bSign;
5646 
5647     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5648         float_raise(float_flag_invalid, status);
5649         return 0;
5650     }
5651     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5652               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5653          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5654               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5655        ) {
5656         if (floatx80_is_signaling_nan(a, status)
5657          || floatx80_is_signaling_nan(b, status)) {
5658             float_raise(float_flag_invalid, status);
5659         }
5660         return 0;
5661     }
5662     aSign = extractFloatx80Sign( a );
5663     bSign = extractFloatx80Sign( b );
5664     if ( aSign != bSign ) {
5665         return
5666                aSign
5667             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5668                  == 0 );
5669     }
5670     return
5671           aSign ? le128( b.high, b.low, a.high, a.low )
5672         : le128( a.high, a.low, b.high, b.low );
5673 
5674 }
5675 
5676 /*----------------------------------------------------------------------------
5677 | Returns 1 if the extended double-precision floating-point value `a' is less
5678 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
5679 | an exception.  Otherwise, the comparison is performed according to the
5680 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5681 *----------------------------------------------------------------------------*/
5682 
5683 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
5684 {
5685     flag aSign, bSign;
5686 
5687     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5688         float_raise(float_flag_invalid, status);
5689         return 0;
5690     }
5691     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5692               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5693          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5694               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5695        ) {
5696         if (floatx80_is_signaling_nan(a, status)
5697          || floatx80_is_signaling_nan(b, status)) {
5698             float_raise(float_flag_invalid, status);
5699         }
5700         return 0;
5701     }
5702     aSign = extractFloatx80Sign( a );
5703     bSign = extractFloatx80Sign( b );
5704     if ( aSign != bSign ) {
5705         return
5706                aSign
5707             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5708                  != 0 );
5709     }
5710     return
5711           aSign ? lt128( b.high, b.low, a.high, a.low )
5712         : lt128( a.high, a.low, b.high, b.low );
5713 
5714 }
5715 
5716 /*----------------------------------------------------------------------------
5717 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5718 | cannot be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.
5719 | The comparison is performed according to the IEC/IEEE Standard for Binary
5720 | Floating-Point Arithmetic.
5721 *----------------------------------------------------------------------------*/
5722 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
5723 {
5724     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5725         float_raise(float_flag_invalid, status);
5726         return 1;
5727     }
5728     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5729               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5730          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5731               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5732        ) {
5733         if (floatx80_is_signaling_nan(a, status)
5734          || floatx80_is_signaling_nan(b, status)) {
5735             float_raise(float_flag_invalid, status);
5736         }
5737         return 1;
5738     }
5739     return 0;
5740 }
5741 
5742 /*----------------------------------------------------------------------------
5743 | Returns the result of converting the quadruple-precision floating-point
5744 | value `a' to the 32-bit two's complement integer format.  The conversion
5745 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5746 | Arithmetic---which means in particular that the conversion is rounded
5747 | according to the current rounding mode.  If `a' is a NaN, the largest
5748 | positive integer is returned.  Otherwise, if the conversion overflows, the
5749 | largest integer with the same sign as `a' is returned.
5750 *----------------------------------------------------------------------------*/
5751 
5752 int32_t float128_to_int32(float128 a, float_status *status)
5753 {
5754     flag aSign;
5755     int32_t aExp, shiftCount;
5756     uint64_t aSig0, aSig1;
5757 
5758     aSig1 = extractFloat128Frac1( a );
5759     aSig0 = extractFloat128Frac0( a );
5760     aExp = extractFloat128Exp( a );
5761     aSign = extractFloat128Sign( a );
5762     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5763     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5764     aSig0 |= ( aSig1 != 0 );
5765     shiftCount = 0x4028 - aExp;
5766     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5767     return roundAndPackInt32(aSign, aSig0, status);
5768 
5769 }
5770 
5771 /*----------------------------------------------------------------------------
5772 | Returns the result of converting the quadruple-precision floating-point
5773 | value `a' to the 32-bit two's complement integer format.  The conversion
5774 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5775 | Arithmetic, except that the conversion is always rounded toward zero.  If
5776 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
5777 | conversion overflows, the largest integer with the same sign as `a' is
5778 | returned.
5779 *----------------------------------------------------------------------------*/
5780 
5781 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
5782 {
5783     flag aSign;
5784     int32_t aExp, shiftCount;
5785     uint64_t aSig0, aSig1, savedASig;
5786     int32_t z;
5787 
5788     aSig1 = extractFloat128Frac1( a );
5789     aSig0 = extractFloat128Frac0( a );
5790     aExp = extractFloat128Exp( a );
5791     aSign = extractFloat128Sign( a );
5792     aSig0 |= ( aSig1 != 0 );
5793     if ( 0x401E < aExp ) {
5794         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5795         goto invalid;
5796     }
5797     else if ( aExp < 0x3FFF ) {
5798         if (aExp || aSig0) {
5799             status->float_exception_flags |= float_flag_inexact;
5800         }
5801         return 0;
5802     }
5803     aSig0 |= LIT64( 0x0001000000000000 );
5804     shiftCount = 0x402F - aExp;
5805     savedASig = aSig0;
5806     aSig0 >>= shiftCount;
5807     z = aSig0;
5808     if ( aSign ) z = - z;
5809     if ( ( z < 0 ) ^ aSign ) {
5810  invalid:
5811         float_raise(float_flag_invalid, status);
5812         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5813     }
5814     if ( ( aSig0<<shiftCount ) != savedASig ) {
5815         status->float_exception_flags |= float_flag_inexact;
5816     }
5817     return z;
5818 
5819 }
5820 
5821 /*----------------------------------------------------------------------------
5822 | Returns the result of converting the quadruple-precision floating-point
5823 | value `a' to the 64-bit two's complement integer format.  The conversion
5824 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5825 | Arithmetic---which means in particular that the conversion is rounded
5826 | according to the current rounding mode.  If `a' is a NaN, the largest
5827 | positive integer is returned.  Otherwise, if the conversion overflows, the
5828 | largest integer with the same sign as `a' is returned.
5829 *----------------------------------------------------------------------------*/
5830 
5831 int64_t float128_to_int64(float128 a, float_status *status)
5832 {
5833     flag aSign;
5834     int32_t aExp, shiftCount;
5835     uint64_t aSig0, aSig1;
5836 
5837     aSig1 = extractFloat128Frac1( a );
5838     aSig0 = extractFloat128Frac0( a );
5839     aExp = extractFloat128Exp( a );
5840     aSign = extractFloat128Sign( a );
5841     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5842     shiftCount = 0x402F - aExp;
5843     if ( shiftCount <= 0 ) {
5844         if ( 0x403E < aExp ) {
5845             float_raise(float_flag_invalid, status);
5846             if (    ! aSign
5847                  || (    ( aExp == 0x7FFF )
5848                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5849                     )
5850                ) {
5851                 return LIT64( 0x7FFFFFFFFFFFFFFF );
5852             }
5853             return (int64_t) LIT64( 0x8000000000000000 );
5854         }
5855         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5856     }
5857     else {
5858         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5859     }
5860     return roundAndPackInt64(aSign, aSig0, aSig1, status);
5861 
5862 }
5863 
5864 /*----------------------------------------------------------------------------
5865 | Returns the result of converting the quadruple-precision floating-point
5866 | value `a' to the 64-bit two's complement integer format.  The conversion
5867 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5868 | Arithmetic, except that the conversion is always rounded toward zero.
5869 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
5870 | the conversion overflows, the largest integer with the same sign as `a' is
5871 | returned.
5872 *----------------------------------------------------------------------------*/
5873 
5874 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
5875 {
5876     flag aSign;
5877     int32_t aExp, shiftCount;
5878     uint64_t aSig0, aSig1;
5879     int64_t z;
5880 
5881     aSig1 = extractFloat128Frac1( a );
5882     aSig0 = extractFloat128Frac0( a );
5883     aExp = extractFloat128Exp( a );
5884     aSign = extractFloat128Sign( a );
5885     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5886     shiftCount = aExp - 0x402F;
5887     if ( 0 < shiftCount ) {
5888         if ( 0x403E <= aExp ) {
5889             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5890             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
5891                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
5892                 if (aSig1) {
5893                     status->float_exception_flags |= float_flag_inexact;
5894                 }
5895             }
5896             else {
5897                 float_raise(float_flag_invalid, status);
5898                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
5899                     return LIT64( 0x7FFFFFFFFFFFFFFF );
5900                 }
5901             }
5902             return (int64_t) LIT64( 0x8000000000000000 );
5903         }
5904         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
5905         if ( (uint64_t) ( aSig1<<shiftCount ) ) {
5906             status->float_exception_flags |= float_flag_inexact;
5907         }
5908     }
5909     else {
5910         if ( aExp < 0x3FFF ) {
5911             if ( aExp | aSig0 | aSig1 ) {
5912                 status->float_exception_flags |= float_flag_inexact;
5913             }
5914             return 0;
5915         }
5916         z = aSig0>>( - shiftCount );
5917         if (    aSig1
5918              || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
5919             status->float_exception_flags |= float_flag_inexact;
5920         }
5921     }
5922     if ( aSign ) z = - z;
5923     return z;
5924 
5925 }
5926 
5927 /*----------------------------------------------------------------------------
5928 | Returns the result of converting the quadruple-precision floating-point value
5929 | `a' to the 64-bit unsigned integer format.  The conversion is
5930 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5931 | Arithmetic---which means in particular that the conversion is rounded
5932 | according to the current rounding mode.  If `a' is a NaN, the largest
5933 | positive integer is returned.  If the conversion overflows, the
5934 | largest unsigned integer is returned.  If 'a' is negative, the value is
5935 | rounded and zero is returned; negative values that do not round to zero
5936 | will raise the inexact exception.
5937 *----------------------------------------------------------------------------*/
5938 
5939 uint64_t float128_to_uint64(float128 a, float_status *status)
5940 {
5941     flag aSign;
5942     int aExp;
5943     int shiftCount;
5944     uint64_t aSig0, aSig1;
5945 
5946     aSig0 = extractFloat128Frac0(a);
5947     aSig1 = extractFloat128Frac1(a);
5948     aExp = extractFloat128Exp(a);
5949     aSign = extractFloat128Sign(a);
5950     if (aSign && (aExp > 0x3FFE)) {
5951         float_raise(float_flag_invalid, status);
5952         if (float128_is_any_nan(a)) {
5953             return LIT64(0xFFFFFFFFFFFFFFFF);
5954         } else {
5955             return 0;
5956         }
5957     }
5958     if (aExp) {
5959         aSig0 |= LIT64(0x0001000000000000);
5960     }
5961     shiftCount = 0x402F - aExp;
5962     if (shiftCount <= 0) {
5963         if (0x403E < aExp) {
5964             float_raise(float_flag_invalid, status);
5965             return LIT64(0xFFFFFFFFFFFFFFFF);
5966         }
5967         shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
5968     } else {
5969         shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
5970     }
5971     return roundAndPackUint64(aSign, aSig0, aSig1, status);
5972 }
5973 
5974 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
5975 {
5976     uint64_t v;
5977     signed char current_rounding_mode = status->float_rounding_mode;
5978 
5979     set_float_rounding_mode(float_round_to_zero, status);
5980     v = float128_to_uint64(a, status);
5981     set_float_rounding_mode(current_rounding_mode, status);
5982 
5983     return v;
5984 }
5985 
5986 /*----------------------------------------------------------------------------
5987 | Returns the result of converting the quadruple-precision floating-point
5988 | value `a' to the 32-bit unsigned integer format.  The conversion
5989 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5990 | Arithmetic except that the conversion is always rounded toward zero.
5991 | If `a' is a NaN, the largest positive integer is returned.  Otherwise,
5992 | if the conversion overflows, the largest unsigned integer is returned.
5993 | If 'a' is negative, the value is rounded and zero is returned; negative
5994 | values that do not round to zero will raise the inexact exception.
5995 *----------------------------------------------------------------------------*/
5996 
5997 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
5998 {
5999     uint64_t v;
6000     uint32_t res;
6001     int old_exc_flags = get_float_exception_flags(status);
6002 
6003     v = float128_to_uint64_round_to_zero(a, status);
6004     if (v > 0xffffffff) {
6005         res = 0xffffffff;
6006     } else {
6007         return v;
6008     }
6009     set_float_exception_flags(old_exc_flags, status);
6010     float_raise(float_flag_invalid, status);
6011     return res;
6012 }
6013 
6014 /*----------------------------------------------------------------------------
6015 | Returns the result of converting the quadruple-precision floating-point
6016 | value `a' to the single-precision floating-point format.  The conversion
6017 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6018 | Arithmetic.
6019 *----------------------------------------------------------------------------*/
6020 
6021 float32 float128_to_float32(float128 a, float_status *status)
6022 {
6023     flag aSign;
6024     int32_t aExp;
6025     uint64_t aSig0, aSig1;
6026     uint32_t zSig;
6027 
6028     aSig1 = extractFloat128Frac1( a );
6029     aSig0 = extractFloat128Frac0( a );
6030     aExp = extractFloat128Exp( a );
6031     aSign = extractFloat128Sign( a );
6032     if ( aExp == 0x7FFF ) {
6033         if ( aSig0 | aSig1 ) {
6034             return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
6035         }
6036         return packFloat32( aSign, 0xFF, 0 );
6037     }
6038     aSig0 |= ( aSig1 != 0 );
6039     shift64RightJamming( aSig0, 18, &aSig0 );
6040     zSig = aSig0;
6041     if ( aExp || zSig ) {
6042         zSig |= 0x40000000;
6043         aExp -= 0x3F81;
6044     }
6045     return roundAndPackFloat32(aSign, aExp, zSig, status);
6046 
6047 }
6048 
6049 /*----------------------------------------------------------------------------
6050 | Returns the result of converting the quadruple-precision floating-point
6051 | value `a' to the double-precision floating-point format.  The conversion
6052 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6053 | Arithmetic.
6054 *----------------------------------------------------------------------------*/
6055 
6056 float64 float128_to_float64(float128 a, float_status *status)
6057 {
6058     flag aSign;
6059     int32_t aExp;
6060     uint64_t aSig0, aSig1;
6061 
6062     aSig1 = extractFloat128Frac1( a );
6063     aSig0 = extractFloat128Frac0( a );
6064     aExp = extractFloat128Exp( a );
6065     aSign = extractFloat128Sign( a );
6066     if ( aExp == 0x7FFF ) {
6067         if ( aSig0 | aSig1 ) {
6068             return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
6069         }
6070         return packFloat64( aSign, 0x7FF, 0 );
6071     }
6072     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6073     aSig0 |= ( aSig1 != 0 );
6074     if ( aExp || aSig0 ) {
6075         aSig0 |= LIT64( 0x4000000000000000 );
6076         aExp -= 0x3C01;
6077     }
6078     return roundAndPackFloat64(aSign, aExp, aSig0, status);
6079 
6080 }
6081 
6082 /*----------------------------------------------------------------------------
6083 | Returns the result of converting the quadruple-precision floating-point
6084 | value `a' to the extended double-precision floating-point format.  The
6085 | conversion is performed according to the IEC/IEEE Standard for Binary
6086 | Floating-Point Arithmetic.
6087 *----------------------------------------------------------------------------*/
6088 
6089 floatx80 float128_to_floatx80(float128 a, float_status *status)
6090 {
6091     flag aSign;
6092     int32_t aExp;
6093     uint64_t aSig0, aSig1;
6094 
6095     aSig1 = extractFloat128Frac1( a );
6096     aSig0 = extractFloat128Frac0( a );
6097     aExp = extractFloat128Exp( a );
6098     aSign = extractFloat128Sign( a );
6099     if ( aExp == 0x7FFF ) {
6100         if ( aSig0 | aSig1 ) {
6101             return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
6102         }
6103         return packFloatx80(aSign, floatx80_infinity_high,
6104                                    floatx80_infinity_low);
6105     }
6106     if ( aExp == 0 ) {
6107         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
6108         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6109     }
6110     else {
6111         aSig0 |= LIT64( 0x0001000000000000 );
6112     }
6113     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
6114     return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
6115 
6116 }
6117 
/*----------------------------------------------------------------------------
| Rounds the quadruple-precision floating-point value `a' to an integer, and
| returns the result as a quadruple-precision floating-point value.  The
| operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
|
| The rounding direction is read from `status->float_rounding_mode'.  The
| inexact flag is set in `status' whenever the returned value differs from
| `a'.  NaN inputs are handed to propagateFloat128NaN().  Three exponent
| ranges are treated separately:
|   - 0x402F <= aExp: the rounding position lies in the low 64-bit word (or
|     exactly between the two words when aExp == 0x402F); inputs with
|     0x406F <= aExp are returned unchanged unless they are NaNs.
|   - aExp < 0x3FFF: the result is either a zero or a value with exponent
|     0x3FFF and zero fraction, with the sign chosen per rounding mode.
|   - otherwise: the rounding position lies in the high 64-bit word.
*----------------------------------------------------------------------------*/

float128 float128_round_to_int(float128 a, float_status *status)
{
    flag aSign;
    int32_t aExp;
    uint64_t lastBitMask, roundBitsMask;
    float128 z;

    aExp = extractFloat128Exp( a );
    if ( 0x402F <= aExp ) {
        if ( 0x406F <= aExp ) {
            /* No bits to discard: only NaNs need special handling. */
            if (    ( aExp == 0x7FFF )
                 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
               ) {
                return propagateFloat128NaN(a, a, status);
            }
            return a;
        }
        /*
         * The shift is done in two steps so that aExp == 0x402F (a total
         * shift of 64) is well defined; in that case lastBitMask becomes 0
         * and roundBitsMask becomes all ones.
         */
        lastBitMask = 1;
        lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
        roundBitsMask = lastBitMask - 1;
        z = a;
        switch (status->float_rounding_mode) {
        case float_round_nearest_even:
            if ( lastBitMask ) {
                /*
                 * Add half of the last kept bit; if the discarded bits are
                 * then all zero the input was an exact tie, so clear the
                 * last kept bit.
                 */
                add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
                if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
            }
            else {
                /*
                 * The whole low word is discarded: its top bit is the
                 * half-way bit and bit 0 of z.high is the last kept bit.
                 */
                if ( (int64_t) z.low < 0 ) {
                    ++z.high;
                    if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
                }
            }
            break;
        case float_round_ties_away:
            /* Same as above but without the tie correction. */
            if (lastBitMask) {
                add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
            } else {
                if ((int64_t) z.low < 0) {
                    ++z.high;
                }
            }
            break;
        case float_round_to_zero:
            break;
        case float_round_up:
            /* Only non-negative values are bumped by the discarded-bit mask. */
            if (!extractFloat128Sign(z)) {
                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
            }
            break;
        case float_round_down:
            /* Only negative values are bumped by the discarded-bit mask. */
            if (extractFloat128Sign(z)) {
                add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
            }
            break;
        default:
            abort();
        }
        /* Drop the discarded bits. */
        z.low &= ~ roundBitsMask;
    }
    else {
        if ( aExp < 0x3FFF ) {
            /* Zeros of either sign are returned unchanged, with no flags. */
            if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
            status->float_exception_flags |= float_flag_inexact;
            aSign = extractFloat128Sign( a );
            /*
             * Rounding modes without a case below (float_round_to_zero
             * among them) leave the switch and yield a signed zero.
             */
            switch (status->float_rounding_mode) {
             case float_round_nearest_even:
                /* Exponent 0x3FFE with a non-zero fraction rounds away. */
                if (    ( aExp == 0x3FFE )
                     && (   extractFloat128Frac0( a )
                          | extractFloat128Frac1( a ) )
                   ) {
                    return packFloat128( aSign, 0x3FFF, 0, 0 );
                }
                break;
            case float_round_ties_away:
                /* Exponent 0x3FFE rounds away regardless of the fraction. */
                if (aExp == 0x3FFE) {
                    return packFloat128(aSign, 0x3FFF, 0, 0);
                }
                break;
             case float_round_down:
                return
                      aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
                    : packFloat128( 0, 0, 0, 0 );
             case float_round_up:
                return
                      aSign ? packFloat128( 1, 0, 0, 0 )
                    : packFloat128( 0, 0x3FFF, 0, 0 );
            }
            return packFloat128( aSign, 0, 0, 0 );
        }
        /*
         * Rounding position lies in the high word: the whole low word is
         * discarded and only contributes as a "non-zero" indication.
         */
        lastBitMask = 1;
        lastBitMask <<= 0x402F - aExp;
        roundBitsMask = lastBitMask - 1;
        z.low = 0;
        z.high = a.high;
        switch (status->float_rounding_mode) {
        case float_round_nearest_even:
            z.high += lastBitMask>>1;
            /* Exact tie only if the low word is zero as well. */
            if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
                z.high &= ~ lastBitMask;
            }
            break;
        case float_round_ties_away:
            z.high += lastBitMask>>1;
            break;
        case float_round_to_zero:
            break;
        case float_round_up:
            if (!extractFloat128Sign(z)) {
                /* A non-zero low word acts as a set bit 0 of the high word. */
                z.high |= ( a.low != 0 );
                z.high += roundBitsMask;
            }
            break;
        case float_round_down:
            if (extractFloat128Sign(z)) {
                /* A non-zero low word acts as a set bit 0 of the high word. */
                z.high |= (a.low != 0);
                z.high += roundBitsMask;
            }
            break;
        default:
            abort();
        }
        z.high &= ~ roundBitsMask;
    }
    /* Any change relative to the input means bits were rounded off. */
    if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
        status->float_exception_flags |= float_flag_inexact;
    }
    return z;

}
6255 
/*----------------------------------------------------------------------------
| Returns the result of adding the absolute values of the quadruple-precision
| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
| before being returned.  `zSign' is ignored if the result is a NaN.
| The addition is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
|
| The operand with the smaller exponent is shifted right by the exponent
| difference before the significands are added.  When both exponents are zero
| the raw fraction sum is packed directly, or replaced by a signed zero when
| `status->flush_to_zero' is set (raising float_flag_output_denormal if the
| sum was non-zero).
*----------------------------------------------------------------------------*/

static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
    int32_t expDiff;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    expDiff = aExp - bExp;
    if ( 0 < expDiff ) {
        /* `a' has the larger exponent. */
        if ( aExp == 0x7FFF ) {
            /* `a' is a NaN (non-zero fraction) or an infinity. */
            if (aSig0 | aSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            return a;
        }
        if ( bExp == 0 ) {
            /* Zero exponent: no implicit bit, and one less shift is needed. */
            --expDiff;
        }
        else {
            bSig0 |= LIT64( 0x0001000000000000 );
        }
        /* Align `b' with `a'; bits shifted out are collected in zSig2. */
        shift128ExtraRightJamming(
            bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
        zExp = aExp;
    }
    else if ( expDiff < 0 ) {
        /* `b' has the larger exponent. */
        if ( bExp == 0x7FFF ) {
            if (bSig0 | bSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            /* `b' is an infinity: the result takes the requested sign. */
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        if ( aExp == 0 ) {
            /* Zero exponent: no implicit bit, and one less shift is needed. */
            ++expDiff;
        }
        else {
            aSig0 |= LIT64( 0x0001000000000000 );
        }
        /* Align `a' with `b'; bits shifted out are collected in zSig2. */
        shift128ExtraRightJamming(
            aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
        zExp = bExp;
    }
    else {
        /* Equal exponents. */
        if ( aExp == 0x7FFF ) {
            if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
                return propagateFloat128NaN(a, b, status);
            }
            return a;
        }
        add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
        if ( aExp == 0 ) {
            /* Both exponents are zero: no implicit bits take part. */
            if (status->flush_to_zero) {
                if (zSig0 | zSig1) {
                    float_raise(float_flag_output_denormal, status);
                }
                return packFloat128(zSign, 0, 0, 0);
            }
            return packFloat128( zSign, 0, zSig0, zSig1 );
        }
        zSig2 = 0;
        /* Accounts for the two implicit bits that were not OR-ed in above. */
        zSig0 |= LIT64( 0x0002000000000000 );
        zExp = aExp;
        goto shiftRight1;
    }
    /*
     * Unequal exponents.  Setting the implicit bit on aSig0 covers both
     * branches: either `a' is the larger operand, or aSig0 was already
     * shifted right above and the bit stands for the implicit bit of `b'.
     */
    aSig0 |= LIT64( 0x0001000000000000 );
    add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
    --zExp;
    /* No carry past the implicit-bit position: no further shift needed. */
    if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
    ++zExp;
 shiftRight1:
    shift128ExtraRightJamming(
        zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
 roundAndPack:
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}
6346 
/*----------------------------------------------------------------------------
| Returns the result of subtracting the absolute values of the quadruple-
| precision floating-point values `a' and `b'.  If `zSign' is 1, the
| difference is negated before being returned.  `zSign' is ignored if the
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic.
|
| Both fractions are first shifted left by 14 bits; the final exponent is
| reduced by 14 to compensate.  The smaller magnitude is always subtracted
| from the larger one, and `zSign' is inverted when `b' is the larger.
| Subtracting two infinities raises the invalid exception and returns the
| default NaN.  An exactly zero difference is negative only in the
| float_round_down rounding mode.
*----------------------------------------------------------------------------*/

static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
    int32_t expDiff;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    expDiff = aExp - bExp;
    /* Pre-shift both fractions left by 14 bits before aligning them. */
    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
    shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
    if ( 0 < expDiff ) goto aExpBigger;
    if ( expDiff < 0 ) goto bExpBigger;
    /* Equal exponents from here on. */
    if ( aExp == 0x7FFF ) {
        if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
            return propagateFloat128NaN(a, b, status);
        }
        /* Infinity minus infinity. */
        float_raise(float_flag_invalid, status);
        return float128_default_nan(status);
    }
    if ( aExp == 0 ) {
        /* Both exponents are zero: continue with an exponent of 1. */
        aExp = 1;
        bExp = 1;
    }
    /* Decide which significand is larger, high word first. */
    if ( bSig0 < aSig0 ) goto aBigger;
    if ( aSig0 < bSig0 ) goto bBigger;
    if ( bSig1 < aSig1 ) goto aBigger;
    if ( aSig1 < bSig1 ) goto bBigger;
    /* Equal magnitudes: a zero, negative only when rounding down. */
    return packFloat128(status->float_rounding_mode == float_round_down,
                        0, 0, 0);
 bExpBigger:
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        /* `b' is an infinity: the result is infinite with inverted sign. */
        return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
    }
    if ( aExp == 0 ) {
        /* Zero exponent: no implicit bit, and one less shift is needed. */
        ++expDiff;
    }
    else {
        /* Implicit bit, at its position after the 14-bit pre-shift. */
        aSig0 |= LIT64( 0x4000000000000000 );
    }
    shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
    bSig0 |= LIT64( 0x4000000000000000 );
 bBigger:
    sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
    zExp = bExp;
    /* The operands were swapped, so the sign of the result flips. */
    zSign ^= 1;
    goto normalizeRoundAndPack;
 aExpBigger:
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        return a;
    }
    if ( bExp == 0 ) {
        /* Zero exponent: no implicit bit, and one less shift is needed. */
        --expDiff;
    }
    else {
        /* Implicit bit, at its position after the 14-bit pre-shift. */
        bSig0 |= LIT64( 0x4000000000000000 );
    }
    shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
    aSig0 |= LIT64( 0x4000000000000000 );
 aBigger:
    sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
    zExp = aExp;
 normalizeRoundAndPack:
    --zExp;
    /* The extra 14 undoes the initial left shift of the fractions. */
    return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
                                         status);

}
6434 
6435 /*----------------------------------------------------------------------------
6436 | Returns the result of adding the quadruple-precision floating-point values
6437 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
6438 | for Binary Floating-Point Arithmetic.
6439 *----------------------------------------------------------------------------*/
6440 
6441 float128 float128_add(float128 a, float128 b, float_status *status)
6442 {
6443     flag aSign, bSign;
6444 
6445     aSign = extractFloat128Sign( a );
6446     bSign = extractFloat128Sign( b );
6447     if ( aSign == bSign ) {
6448         return addFloat128Sigs(a, b, aSign, status);
6449     }
6450     else {
6451         return subFloat128Sigs(a, b, aSign, status);
6452     }
6453 
6454 }
6455 
6456 /*----------------------------------------------------------------------------
6457 | Returns the result of subtracting the quadruple-precision floating-point
6458 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6459 | Standard for Binary Floating-Point Arithmetic.
6460 *----------------------------------------------------------------------------*/
6461 
6462 float128 float128_sub(float128 a, float128 b, float_status *status)
6463 {
6464     flag aSign, bSign;
6465 
6466     aSign = extractFloat128Sign( a );
6467     bSign = extractFloat128Sign( b );
6468     if ( aSign == bSign ) {
6469         return subFloat128Sigs(a, b, aSign, status);
6470     }
6471     else {
6472         return addFloat128Sigs(a, b, aSign, status);
6473     }
6474 
6475 }
6476 
6477 /*----------------------------------------------------------------------------
6478 | Returns the result of multiplying the quadruple-precision floating-point
6479 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6480 | Standard for Binary Floating-Point Arithmetic.
6481 *----------------------------------------------------------------------------*/
6482 
6483 float128 float128_mul(float128 a, float128 b, float_status *status)
6484 {
6485     flag aSign, bSign, zSign;
6486     int32_t aExp, bExp, zExp;
6487     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
6488 
6489     aSig1 = extractFloat128Frac1( a );
6490     aSig0 = extractFloat128Frac0( a );
6491     aExp = extractFloat128Exp( a );
6492     aSign = extractFloat128Sign( a );
6493     bSig1 = extractFloat128Frac1( b );
6494     bSig0 = extractFloat128Frac0( b );
6495     bExp = extractFloat128Exp( b );
6496     bSign = extractFloat128Sign( b );
6497     zSign = aSign ^ bSign;
6498     if ( aExp == 0x7FFF ) {
6499         if (    ( aSig0 | aSig1 )
6500              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6501             return propagateFloat128NaN(a, b, status);
6502         }
6503         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6504         return packFloat128( zSign, 0x7FFF, 0, 0 );
6505     }
6506     if ( bExp == 0x7FFF ) {
6507         if (bSig0 | bSig1) {
6508             return propagateFloat128NaN(a, b, status);
6509         }
6510         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6511  invalid:
6512             float_raise(float_flag_invalid, status);
6513             return float128_default_nan(status);
6514         }
6515         return packFloat128( zSign, 0x7FFF, 0, 0 );
6516     }
6517     if ( aExp == 0 ) {
6518         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6519         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6520     }
6521     if ( bExp == 0 ) {
6522         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6523         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6524     }
6525     zExp = aExp + bExp - 0x4000;
6526     aSig0 |= LIT64( 0x0001000000000000 );
6527     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6528     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6529     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6530     zSig2 |= ( zSig3 != 0 );
6531     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6532         shift128ExtraRightJamming(
6533             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6534         ++zExp;
6535     }
6536     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6537 
6538 }
6539 
6540 /*----------------------------------------------------------------------------
6541 | Returns the result of dividing the quadruple-precision floating-point value
6542 | `a' by the corresponding value `b'.  The operation is performed according to
6543 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6544 *----------------------------------------------------------------------------*/
6545 
6546 float128 float128_div(float128 a, float128 b, float_status *status)
6547 {
6548     flag aSign, bSign, zSign;
6549     int32_t aExp, bExp, zExp;
6550     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6551     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6552 
6553     aSig1 = extractFloat128Frac1( a );
6554     aSig0 = extractFloat128Frac0( a );
6555     aExp = extractFloat128Exp( a );
6556     aSign = extractFloat128Sign( a );
6557     bSig1 = extractFloat128Frac1( b );
6558     bSig0 = extractFloat128Frac0( b );
6559     bExp = extractFloat128Exp( b );
6560     bSign = extractFloat128Sign( b );
6561     zSign = aSign ^ bSign;
6562     if ( aExp == 0x7FFF ) {
6563         if (aSig0 | aSig1) {
6564             return propagateFloat128NaN(a, b, status);
6565         }
6566         if ( bExp == 0x7FFF ) {
6567             if (bSig0 | bSig1) {
6568                 return propagateFloat128NaN(a, b, status);
6569             }
6570             goto invalid;
6571         }
6572         return packFloat128( zSign, 0x7FFF, 0, 0 );
6573     }
6574     if ( bExp == 0x7FFF ) {
6575         if (bSig0 | bSig1) {
6576             return propagateFloat128NaN(a, b, status);
6577         }
6578         return packFloat128( zSign, 0, 0, 0 );
6579     }
6580     if ( bExp == 0 ) {
6581         if ( ( bSig0 | bSig1 ) == 0 ) {
6582             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6583  invalid:
6584                 float_raise(float_flag_invalid, status);
6585                 return float128_default_nan(status);
6586             }
6587             float_raise(float_flag_divbyzero, status);
6588             return packFloat128( zSign, 0x7FFF, 0, 0 );
6589         }
6590         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6591     }
6592     if ( aExp == 0 ) {
6593         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6594         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6595     }
6596     zExp = aExp - bExp + 0x3FFD;
6597     shortShift128Left(
6598         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6599     shortShift128Left(
6600         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6601     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6602         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6603         ++zExp;
6604     }
6605     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6606     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6607     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
6608     while ( (int64_t) rem0 < 0 ) {
6609         --zSig0;
6610         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6611     }
6612     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6613     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6614         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6615         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
6616         while ( (int64_t) rem1 < 0 ) {
6617             --zSig1;
6618             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6619         }
6620         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6621     }
6622     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
6623     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6624 
6625 }
6626 
6627 /*----------------------------------------------------------------------------
6628 | Returns the remainder of the quadruple-precision floating-point value `a'
6629 | with respect to the corresponding value `b'.  The operation is performed
6630 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6631 *----------------------------------------------------------------------------*/
6632 
6633 float128 float128_rem(float128 a, float128 b, float_status *status)
6634 {
6635     flag aSign, zSign;
6636     int32_t aExp, bExp, expDiff;
6637     uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6638     uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6639     int64_t sigMean0;
6640 
6641     aSig1 = extractFloat128Frac1( a );
6642     aSig0 = extractFloat128Frac0( a );
6643     aExp = extractFloat128Exp( a );
6644     aSign = extractFloat128Sign( a );
6645     bSig1 = extractFloat128Frac1( b );
6646     bSig0 = extractFloat128Frac0( b );
6647     bExp = extractFloat128Exp( b );
6648     if ( aExp == 0x7FFF ) {
6649         if (    ( aSig0 | aSig1 )
6650              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6651             return propagateFloat128NaN(a, b, status);
6652         }
6653         goto invalid;
6654     }
6655     if ( bExp == 0x7FFF ) {
6656         if (bSig0 | bSig1) {
6657             return propagateFloat128NaN(a, b, status);
6658         }
6659         return a;
6660     }
6661     if ( bExp == 0 ) {
6662         if ( ( bSig0 | bSig1 ) == 0 ) {
6663  invalid:
6664             float_raise(float_flag_invalid, status);
6665             return float128_default_nan(status);
6666         }
6667         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6668     }
6669     if ( aExp == 0 ) {
6670         if ( ( aSig0 | aSig1 ) == 0 ) return a;
6671         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6672     }
6673     expDiff = aExp - bExp;
6674     if ( expDiff < -1 ) return a;
6675     shortShift128Left(
6676         aSig0 | LIT64( 0x0001000000000000 ),
6677         aSig1,
6678         15 - ( expDiff < 0 ),
6679         &aSig0,
6680         &aSig1
6681     );
6682     shortShift128Left(
6683         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6684     q = le128( bSig0, bSig1, aSig0, aSig1 );
6685     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6686     expDiff -= 64;
6687     while ( 0 < expDiff ) {
6688         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6689         q = ( 4 < q ) ? q - 4 : 0;
6690         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6691         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6692         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6693         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6694         expDiff -= 61;
6695     }
6696     if ( -64 < expDiff ) {
6697         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6698         q = ( 4 < q ) ? q - 4 : 0;
6699         q >>= - expDiff;
6700         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6701         expDiff += 52;
6702         if ( expDiff < 0 ) {
6703             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6704         }
6705         else {
6706             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6707         }
6708         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6709         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6710     }
6711     else {
6712         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6713         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6714     }
6715     do {
6716         alternateASig0 = aSig0;
6717         alternateASig1 = aSig1;
6718         ++q;
6719         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6720     } while ( 0 <= (int64_t) aSig0 );
6721     add128(
6722         aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
6723     if (    ( sigMean0 < 0 )
6724          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6725         aSig0 = alternateASig0;
6726         aSig1 = alternateASig1;
6727     }
6728     zSign = ( (int64_t) aSig0 < 0 );
6729     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6730     return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
6731                                          status);
6732 }
6733 
6734 /*----------------------------------------------------------------------------
6735 | Returns the square root of the quadruple-precision floating-point value `a'.
6736 | The operation is performed according to the IEC/IEEE Standard for Binary
6737 | Floating-Point Arithmetic.
6738 *----------------------------------------------------------------------------*/
6739 
6740 float128 float128_sqrt(float128 a, float_status *status)
6741 {
6742     flag aSign;
6743     int32_t aExp, zExp;
6744     uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6745     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6746 
6747     aSig1 = extractFloat128Frac1( a );
6748     aSig0 = extractFloat128Frac0( a );
6749     aExp = extractFloat128Exp( a );
6750     aSign = extractFloat128Sign( a );
6751     if ( aExp == 0x7FFF ) {
6752         if (aSig0 | aSig1) {
6753             return propagateFloat128NaN(a, a, status);
6754         }
6755         if ( ! aSign ) return a;
6756         goto invalid;
6757     }
6758     if ( aSign ) {
6759         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6760  invalid:
6761         float_raise(float_flag_invalid, status);
6762         return float128_default_nan(status);
6763     }
6764     if ( aExp == 0 ) {
6765         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6766         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6767     }
6768     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6769     aSig0 |= LIT64( 0x0001000000000000 );
6770     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6771     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6772     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6773     doubleZSig0 = zSig0<<1;
6774     mul64To128( zSig0, zSig0, &term0, &term1 );
6775     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6776     while ( (int64_t) rem0 < 0 ) {
6777         --zSig0;
6778         doubleZSig0 -= 2;
6779         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6780     }
6781     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6782     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6783         if ( zSig1 == 0 ) zSig1 = 1;
6784         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6785         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6786         mul64To128( zSig1, zSig1, &term2, &term3 );
6787         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6788         while ( (int64_t) rem1 < 0 ) {
6789             --zSig1;
6790             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6791             term3 |= 1;
6792             term2 |= doubleZSig0;
6793             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6794         }
6795         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6796     }
6797     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6798     return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
6799 
6800 }
6801 
6802 /*----------------------------------------------------------------------------
6803 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6804 | the corresponding value `b', and 0 otherwise.  The invalid exception is
6805 | raised if either operand is a NaN.  Otherwise, the comparison is performed
6806 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6807 *----------------------------------------------------------------------------*/
6808 
6809 int float128_eq(float128 a, float128 b, float_status *status)
6810 {
6811 
6812     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6813               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6814          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6815               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6816        ) {
6817         float_raise(float_flag_invalid, status);
6818         return 0;
6819     }
6820     return
6821            ( a.low == b.low )
6822         && (    ( a.high == b.high )
6823              || (    ( a.low == 0 )
6824                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6825            );
6826 
6827 }
6828 
6829 /*----------------------------------------------------------------------------
6830 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6831 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
6832 | exception is raised if either operand is a NaN.  The comparison is performed
6833 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6834 *----------------------------------------------------------------------------*/
6835 
6836 int float128_le(float128 a, float128 b, float_status *status)
6837 {
6838     flag aSign, bSign;
6839 
6840     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6841               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6842          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6843               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6844        ) {
6845         float_raise(float_flag_invalid, status);
6846         return 0;
6847     }
6848     aSign = extractFloat128Sign( a );
6849     bSign = extractFloat128Sign( b );
6850     if ( aSign != bSign ) {
6851         return
6852                aSign
6853             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6854                  == 0 );
6855     }
6856     return
6857           aSign ? le128( b.high, b.low, a.high, a.low )
6858         : le128( a.high, a.low, b.high, b.low );
6859 
6860 }
6861 
6862 /*----------------------------------------------------------------------------
6863 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6864 | the corresponding value `b', and 0 otherwise.  The invalid exception is
6865 | raised if either operand is a NaN.  The comparison is performed according
6866 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6867 *----------------------------------------------------------------------------*/
6868 
6869 int float128_lt(float128 a, float128 b, float_status *status)
6870 {
6871     flag aSign, bSign;
6872 
6873     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6874               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6875          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6876               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6877        ) {
6878         float_raise(float_flag_invalid, status);
6879         return 0;
6880     }
6881     aSign = extractFloat128Sign( a );
6882     bSign = extractFloat128Sign( b );
6883     if ( aSign != bSign ) {
6884         return
6885                aSign
6886             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6887                  != 0 );
6888     }
6889     return
6890           aSign ? lt128( b.high, b.low, a.high, a.low )
6891         : lt128( a.high, a.low, b.high, b.low );
6892 
6893 }
6894 
6895 /*----------------------------------------------------------------------------
6896 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6897 | be compared, and 0 otherwise.  The invalid exception is raised if either
6898 | operand is a NaN. The comparison is performed according to the IEC/IEEE
6899 | Standard for Binary Floating-Point Arithmetic.
6900 *----------------------------------------------------------------------------*/
6901 
6902 int float128_unordered(float128 a, float128 b, float_status *status)
6903 {
6904     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6905               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6906          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6907               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6908        ) {
6909         float_raise(float_flag_invalid, status);
6910         return 1;
6911     }
6912     return 0;
6913 }
6914 
6915 /*----------------------------------------------------------------------------
6916 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6917 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
6918 | exception.  The comparison is performed according to the IEC/IEEE Standard
6919 | for Binary Floating-Point Arithmetic.
6920 *----------------------------------------------------------------------------*/
6921 
6922 int float128_eq_quiet(float128 a, float128 b, float_status *status)
6923 {
6924 
6925     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6926               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6927          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6928               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6929        ) {
6930         if (float128_is_signaling_nan(a, status)
6931          || float128_is_signaling_nan(b, status)) {
6932             float_raise(float_flag_invalid, status);
6933         }
6934         return 0;
6935     }
6936     return
6937            ( a.low == b.low )
6938         && (    ( a.high == b.high )
6939              || (    ( a.low == 0 )
6940                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6941            );
6942 
6943 }
6944 
6945 /*----------------------------------------------------------------------------
6946 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6947 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
6948 | cause an exception.  Otherwise, the comparison is performed according to the
6949 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6950 *----------------------------------------------------------------------------*/
6951 
6952 int float128_le_quiet(float128 a, float128 b, float_status *status)
6953 {
6954     flag aSign, bSign;
6955 
6956     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6957               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6958          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6959               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6960        ) {
6961         if (float128_is_signaling_nan(a, status)
6962          || float128_is_signaling_nan(b, status)) {
6963             float_raise(float_flag_invalid, status);
6964         }
6965         return 0;
6966     }
6967     aSign = extractFloat128Sign( a );
6968     bSign = extractFloat128Sign( b );
6969     if ( aSign != bSign ) {
6970         return
6971                aSign
6972             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6973                  == 0 );
6974     }
6975     return
6976           aSign ? le128( b.high, b.low, a.high, a.low )
6977         : le128( a.high, a.low, b.high, b.low );
6978 
6979 }
6980 
6981 /*----------------------------------------------------------------------------
6982 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6983 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
6984 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
6985 | Standard for Binary Floating-Point Arithmetic.
6986 *----------------------------------------------------------------------------*/
6987 
6988 int float128_lt_quiet(float128 a, float128 b, float_status *status)
6989 {
6990     flag aSign, bSign;
6991 
6992     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6993               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6994          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6995               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6996        ) {
6997         if (float128_is_signaling_nan(a, status)
6998          || float128_is_signaling_nan(b, status)) {
6999             float_raise(float_flag_invalid, status);
7000         }
7001         return 0;
7002     }
7003     aSign = extractFloat128Sign( a );
7004     bSign = extractFloat128Sign( b );
7005     if ( aSign != bSign ) {
7006         return
7007                aSign
7008             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7009                  != 0 );
7010     }
7011     return
7012           aSign ? lt128( b.high, b.low, a.high, a.low )
7013         : lt128( a.high, a.low, b.high, b.low );
7014 
7015 }
7016 
7017 /*----------------------------------------------------------------------------
7018 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7019 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
7020 | comparison is performed according to the IEC/IEEE Standard for Binary
7021 | Floating-Point Arithmetic.
7022 *----------------------------------------------------------------------------*/
7023 
7024 int float128_unordered_quiet(float128 a, float128 b, float_status *status)
7025 {
7026     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7027               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7028          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7029               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7030        ) {
7031         if (float128_is_signaling_nan(a, status)
7032          || float128_is_signaling_nan(b, status)) {
7033             float_raise(float_flag_invalid, status);
7034         }
7035         return 1;
7036     }
7037     return 0;
7038 }
7039 
7040 static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
7041                                             int is_quiet, float_status *status)
7042 {
7043     flag aSign, bSign;
7044 
7045     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
7046         float_raise(float_flag_invalid, status);
7047         return float_relation_unordered;
7048     }
7049     if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
7050           ( extractFloatx80Frac( a )<<1 ) ) ||
7051         ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
7052           ( extractFloatx80Frac( b )<<1 ) )) {
7053         if (!is_quiet ||
7054             floatx80_is_signaling_nan(a, status) ||
7055             floatx80_is_signaling_nan(b, status)) {
7056             float_raise(float_flag_invalid, status);
7057         }
7058         return float_relation_unordered;
7059     }
7060     aSign = extractFloatx80Sign( a );
7061     bSign = extractFloatx80Sign( b );
7062     if ( aSign != bSign ) {
7063 
7064         if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
7065              ( ( a.low | b.low ) == 0 ) ) {
7066             /* zero case */
7067             return float_relation_equal;
7068         } else {
7069             return 1 - (2 * aSign);
7070         }
7071     } else {
7072         if (a.low == b.low && a.high == b.high) {
7073             return float_relation_equal;
7074         } else {
7075             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7076         }
7077     }
7078 }
7079 
7080 int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
7081 {
7082     return floatx80_compare_internal(a, b, 0, status);
7083 }
7084 
7085 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
7086 {
7087     return floatx80_compare_internal(a, b, 1, status);
7088 }
7089 
7090 static inline int float128_compare_internal(float128 a, float128 b,
7091                                             int is_quiet, float_status *status)
7092 {
7093     flag aSign, bSign;
7094 
7095     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7096           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7097         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7098           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7099         if (!is_quiet ||
7100             float128_is_signaling_nan(a, status) ||
7101             float128_is_signaling_nan(b, status)) {
7102             float_raise(float_flag_invalid, status);
7103         }
7104         return float_relation_unordered;
7105     }
7106     aSign = extractFloat128Sign( a );
7107     bSign = extractFloat128Sign( b );
7108     if ( aSign != bSign ) {
7109         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7110             /* zero case */
7111             return float_relation_equal;
7112         } else {
7113             return 1 - (2 * aSign);
7114         }
7115     } else {
7116         if (a.low == b.low && a.high == b.high) {
7117             return float_relation_equal;
7118         } else {
7119             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7120         }
7121     }
7122 }
7123 
7124 int float128_compare(float128 a, float128 b, float_status *status)
7125 {
7126     return float128_compare_internal(a, b, 0, status);
7127 }
7128 
7129 int float128_compare_quiet(float128 a, float128 b, float_status *status)
7130 {
7131     return float128_compare_internal(a, b, 1, status);
7132 }
7133 
7134 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
7135 {
7136     flag aSign;
7137     int32_t aExp;
7138     uint64_t aSig;
7139 
7140     if (floatx80_invalid_encoding(a)) {
7141         float_raise(float_flag_invalid, status);
7142         return floatx80_default_nan(status);
7143     }
7144     aSig = extractFloatx80Frac( a );
7145     aExp = extractFloatx80Exp( a );
7146     aSign = extractFloatx80Sign( a );
7147 
7148     if ( aExp == 0x7FFF ) {
7149         if ( aSig<<1 ) {
7150             return propagateFloatx80NaN(a, a, status);
7151         }
7152         return a;
7153     }
7154 
7155     if (aExp == 0) {
7156         if (aSig == 0) {
7157             return a;
7158         }
7159         aExp++;
7160     }
7161 
7162     if (n > 0x10000) {
7163         n = 0x10000;
7164     } else if (n < -0x10000) {
7165         n = -0x10000;
7166     }
7167 
7168     aExp += n;
7169     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7170                                          aSign, aExp, aSig, 0, status);
7171 }
7172 
7173 float128 float128_scalbn(float128 a, int n, float_status *status)
7174 {
7175     flag aSign;
7176     int32_t aExp;
7177     uint64_t aSig0, aSig1;
7178 
7179     aSig1 = extractFloat128Frac1( a );
7180     aSig0 = extractFloat128Frac0( a );
7181     aExp = extractFloat128Exp( a );
7182     aSign = extractFloat128Sign( a );
7183     if ( aExp == 0x7FFF ) {
7184         if ( aSig0 | aSig1 ) {
7185             return propagateFloat128NaN(a, a, status);
7186         }
7187         return a;
7188     }
7189     if (aExp != 0) {
7190         aSig0 |= LIT64( 0x0001000000000000 );
7191     } else if (aSig0 == 0 && aSig1 == 0) {
7192         return a;
7193     } else {
7194         aExp++;
7195     }
7196 
7197     if (n > 0x10000) {
7198         n = 0x10000;
7199     } else if (n < -0x10000) {
7200         n = -0x10000;
7201     }
7202 
7203     aExp += n - 1;
7204     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7205                                          , status);
7206 
7207 }
7208