xref: /openbmc/qemu/fpu/softfloat.c (revision 38472890)
1 /*
2  * QEMU float support
3  *
4  * The code in this source file is derived from release 2a of the SoftFloat
5  * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6  * some later contributions) are provided under that license, as detailed below.
7  * It has subsequently been modified by contributors to the QEMU Project,
8  * so some portions are provided under:
9  *  the SoftFloat-2a license
10  *  the BSD license
11  *  GPL-v2-or-later
12  *
13  * Any future contributions to this file after December 1st 2014 will be
14  * taken to be licensed under the Softfloat-2a license unless specifically
15  * indicated otherwise.
16  */
17 
18 /*
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
22 
23 Written by John R. Hauser.  This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704.  Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980.  The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
32 
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38 
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
43 
44 ===============================================================================
45 */
46 
47 /* BSD licensing:
48  * Copyright (c) 2006, Fabrice Bellard
49  * All rights reserved.
50  *
51  * Redistribution and use in source and binary forms, with or without
52  * modification, are permitted provided that the following conditions are met:
53  *
54  * 1. Redistributions of source code must retain the above copyright notice,
55  * this list of conditions and the following disclaimer.
56  *
57  * 2. Redistributions in binary form must reproduce the above copyright notice,
58  * this list of conditions and the following disclaimer in the documentation
59  * and/or other materials provided with the distribution.
60  *
61  * 3. Neither the name of the copyright holder nor the names of its contributors
62  * may be used to endorse or promote products derived from this software without
63  * specific prior written permission.
64  *
65  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75  * THE POSSIBILITY OF SUCH DAMAGE.
76  */
77 
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79  * version 2 or later. See the COPYING file in the top-level directory.
80  */
81 
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83  * target-dependent and needs the TARGET_* macros.
84  */
85 #include "qemu/osdep.h"
86 #include <math.h>
87 #include "qemu/bitops.h"
88 #include "fpu/softfloat.h"
89 
90 /* We only need stdlib for abort() */
91 
92 /*----------------------------------------------------------------------------
93 | Primitive arithmetic functions, including multi-word arithmetic, and
94 | division and square root approximations.  (Can be specialized to target if
95 | desired.)
96 *----------------------------------------------------------------------------*/
97 #include "fpu/softfloat-macros.h"
98 
99 /*
100  * Hardfloat
101  *
102  * Fast emulation of guest FP instructions is challenging for two reasons.
103  * First, FP instruction semantics are similar but not identical, particularly
104  * when handling NaNs. Second, emulating at reasonable speed the guest FP
105  * exception flags is not trivial: reading the host's flags register with a
106  * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp],
107  * and trapping on every FP exception is not fast nor pleasant to work with.
108  *
109  * We address these challenges by leveraging the host FPU for a subset of the
110  * operations. To do this we expand on the idea presented in this paper:
111  *
112  * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a
113  * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615.
114  *
115  * The idea is thus to leverage the host FPU to (1) compute FP operations
116  * and (2) identify whether FP exceptions occurred while avoiding
117  * expensive exception flag register accesses.
118  *
119  * An important optimization shown in the paper is that given that exception
120  * flags are rarely cleared by the guest, we can avoid recomputing some flags.
121  * This is particularly useful for the inexact flag, which is very frequently
122  * raised in floating-point workloads.
123  *
124  * We optimize the code further by deferring to soft-fp whenever FP exception
125  * detection might get hairy. Two examples: (1) when at least one operand is
126  * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result
127  * and the result is < the minimum normal.
128  */
129 #define GEN_INPUT_FLUSH__NOCHECK(name, soft_t)                          \
130     static inline void name(soft_t *a, float_status *s)                 \
131     {                                                                   \
132         if (unlikely(soft_t ## _is_denormal(*a))) {                     \
133             *a = soft_t ## _set_sign(soft_t ## _zero,                   \
134                                      soft_t ## _is_neg(*a));            \
135             s->float_exception_flags |= float_flag_input_denormal;      \
136         }                                                               \
137     }
138 
139 GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck, float32)
140 GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck, float64)
141 #undef GEN_INPUT_FLUSH__NOCHECK
142 
143 #define GEN_INPUT_FLUSH1(name, soft_t)                  \
144     static inline void name(soft_t *a, float_status *s) \
145     {                                                   \
146         if (likely(!s->flush_inputs_to_zero)) {         \
147             return;                                     \
148         }                                               \
149         soft_t ## _input_flush__nocheck(a, s);          \
150     }
151 
152 GEN_INPUT_FLUSH1(float32_input_flush1, float32)
153 GEN_INPUT_FLUSH1(float64_input_flush1, float64)
154 #undef GEN_INPUT_FLUSH1
155 
156 #define GEN_INPUT_FLUSH2(name, soft_t)                                  \
157     static inline void name(soft_t *a, soft_t *b, float_status *s)      \
158     {                                                                   \
159         if (likely(!s->flush_inputs_to_zero)) {                         \
160             return;                                                     \
161         }                                                               \
162         soft_t ## _input_flush__nocheck(a, s);                          \
163         soft_t ## _input_flush__nocheck(b, s);                          \
164     }
165 
166 GEN_INPUT_FLUSH2(float32_input_flush2, float32)
167 GEN_INPUT_FLUSH2(float64_input_flush2, float64)
168 #undef GEN_INPUT_FLUSH2
169 
170 #define GEN_INPUT_FLUSH3(name, soft_t)                                  \
171     static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \
172     {                                                                   \
173         if (likely(!s->flush_inputs_to_zero)) {                         \
174             return;                                                     \
175         }                                                               \
176         soft_t ## _input_flush__nocheck(a, s);                          \
177         soft_t ## _input_flush__nocheck(b, s);                          \
178         soft_t ## _input_flush__nocheck(c, s);                          \
179     }
180 
181 GEN_INPUT_FLUSH3(float32_input_flush3, float32)
182 GEN_INPUT_FLUSH3(float64_input_flush3, float64)
183 #undef GEN_INPUT_FLUSH3
184 
185 /*
186  * Choose whether to use fpclassify or float32/64_* primitives in the generated
187  * hardfloat functions. Each combination of number of inputs and float size
188  * gets its own value.
189  */
190 #if defined(__x86_64__)
191 # define QEMU_HARDFLOAT_1F32_USE_FP 0
192 # define QEMU_HARDFLOAT_1F64_USE_FP 1
193 # define QEMU_HARDFLOAT_2F32_USE_FP 0
194 # define QEMU_HARDFLOAT_2F64_USE_FP 1
195 # define QEMU_HARDFLOAT_3F32_USE_FP 0
196 # define QEMU_HARDFLOAT_3F64_USE_FP 1
197 #else
198 # define QEMU_HARDFLOAT_1F32_USE_FP 0
199 # define QEMU_HARDFLOAT_1F64_USE_FP 0
200 # define QEMU_HARDFLOAT_2F32_USE_FP 0
201 # define QEMU_HARDFLOAT_2F64_USE_FP 0
202 # define QEMU_HARDFLOAT_3F32_USE_FP 0
203 # define QEMU_HARDFLOAT_3F64_USE_FP 0
204 #endif
205 
206 /*
207  * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over
208  * float{32,64}_is_infinity when !USE_FP.
209  * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup.
210  * On power64 however, using isinf() reduces fp-bench performance by up to 50%.
211  */
212 #if defined(__x86_64__) || defined(__aarch64__)
213 # define QEMU_HARDFLOAT_USE_ISINF   1
214 #else
215 # define QEMU_HARDFLOAT_USE_ISINF   0
216 #endif
217 
218 /*
219  * Some targets clear the FP flags before most FP operations. This prevents
220  * the use of hardfloat, since hardfloat relies on the inexact flag being
221  * already set.
222  */
223 #if defined(TARGET_PPC) || defined(__FAST_MATH__)
224 # if defined(__FAST_MATH__)
225 #  warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \
226     IEEE implementation
227 # endif
228 # define QEMU_NO_HARDFLOAT 1
229 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN
230 #else
231 # define QEMU_NO_HARDFLOAT 0
232 # define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline))
233 #endif
234 
235 static inline bool can_use_fpu(const float_status *s)
236 {
237     if (QEMU_NO_HARDFLOAT) {
238         return false;
239     }
240     return likely(s->float_exception_flags & float_flag_inexact &&
241                   s->float_rounding_mode == float_round_nearest_even);
242 }
243 
244 /*
245  * Hardfloat generation functions. Each operation can have two flavors:
246  * either using softfloat primitives (e.g. float32_is_zero_or_normal) for
247  * most condition checks, or native ones (e.g. fpclassify).
248  *
249  * The flavor is chosen by the callers. Instead of using macros, we rely on the
250  * compiler to propagate constants and inline everything into the callers.
251  *
252  * We only generate functions for operations with two inputs, since only
253  * these are common enough to justify consolidating them into common code.
254  */
255 
256 typedef union {
257     float32 s;
258     float h;
259 } union_float32;
260 
261 typedef union {
262     float64 s;
263     double h;
264 } union_float64;
265 
266 typedef bool (*f32_check_fn)(union_float32 a, union_float32 b);
267 typedef bool (*f64_check_fn)(union_float64 a, union_float64 b);
268 
269 typedef float32 (*soft_f32_op2_fn)(float32 a, float32 b, float_status *s);
270 typedef float64 (*soft_f64_op2_fn)(float64 a, float64 b, float_status *s);
271 typedef float   (*hard_f32_op2_fn)(float a, float b);
272 typedef double  (*hard_f64_op2_fn)(double a, double b);
273 
274 /* 2-input is-zero-or-normal */
275 static inline bool f32_is_zon2(union_float32 a, union_float32 b)
276 {
277     if (QEMU_HARDFLOAT_2F32_USE_FP) {
278         /*
279          * Not using a temp variable for consecutive fpclassify calls ends up
280          * generating faster code.
281          */
282         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
283                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
284     }
285     return float32_is_zero_or_normal(a.s) &&
286            float32_is_zero_or_normal(b.s);
287 }
288 
289 static inline bool f64_is_zon2(union_float64 a, union_float64 b)
290 {
291     if (QEMU_HARDFLOAT_2F64_USE_FP) {
292         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
293                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
294     }
295     return float64_is_zero_or_normal(a.s) &&
296            float64_is_zero_or_normal(b.s);
297 }
298 
299 /* 3-input is-zero-or-normal */
300 static inline
301 bool f32_is_zon3(union_float32 a, union_float32 b, union_float32 c)
302 {
303     if (QEMU_HARDFLOAT_3F32_USE_FP) {
304         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
305                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
306                (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
307     }
308     return float32_is_zero_or_normal(a.s) &&
309            float32_is_zero_or_normal(b.s) &&
310            float32_is_zero_or_normal(c.s);
311 }
312 
313 static inline
314 bool f64_is_zon3(union_float64 a, union_float64 b, union_float64 c)
315 {
316     if (QEMU_HARDFLOAT_3F64_USE_FP) {
317         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
318                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
319                (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
320     }
321     return float64_is_zero_or_normal(a.s) &&
322            float64_is_zero_or_normal(b.s) &&
323            float64_is_zero_or_normal(c.s);
324 }
325 
326 static inline bool f32_is_inf(union_float32 a)
327 {
328     if (QEMU_HARDFLOAT_USE_ISINF) {
329         return isinf(a.h);
330     }
331     return float32_is_infinity(a.s);
332 }
333 
334 static inline bool f64_is_inf(union_float64 a)
335 {
336     if (QEMU_HARDFLOAT_USE_ISINF) {
337         return isinf(a.h);
338     }
339     return float64_is_infinity(a.s);
340 }
341 
342 /* Note: @fast_test and @post can be NULL */
343 static inline float32
344 float32_gen2(float32 xa, float32 xb, float_status *s,
345              hard_f32_op2_fn hard, soft_f32_op2_fn soft,
346              f32_check_fn pre, f32_check_fn post,
347              f32_check_fn fast_test, soft_f32_op2_fn fast_op)
348 {
349     union_float32 ua, ub, ur;
350 
351     ua.s = xa;
352     ub.s = xb;
353 
354     if (unlikely(!can_use_fpu(s))) {
355         goto soft;
356     }
357 
358     float32_input_flush2(&ua.s, &ub.s, s);
359     if (unlikely(!pre(ua, ub))) {
360         goto soft;
361     }
362     if (fast_test && fast_test(ua, ub)) {
363         return fast_op(ua.s, ub.s, s);
364     }
365 
366     ur.h = hard(ua.h, ub.h);
367     if (unlikely(f32_is_inf(ur))) {
368         s->float_exception_flags |= float_flag_overflow;
369     } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) {
370         if (post == NULL || post(ua, ub)) {
371             goto soft;
372         }
373     }
374     return ur.s;
375 
376  soft:
377     return soft(ua.s, ub.s, s);
378 }
379 
380 static inline float64
381 float64_gen2(float64 xa, float64 xb, float_status *s,
382              hard_f64_op2_fn hard, soft_f64_op2_fn soft,
383              f64_check_fn pre, f64_check_fn post,
384              f64_check_fn fast_test, soft_f64_op2_fn fast_op)
385 {
386     union_float64 ua, ub, ur;
387 
388     ua.s = xa;
389     ub.s = xb;
390 
391     if (unlikely(!can_use_fpu(s))) {
392         goto soft;
393     }
394 
395     float64_input_flush2(&ua.s, &ub.s, s);
396     if (unlikely(!pre(ua, ub))) {
397         goto soft;
398     }
399     if (fast_test && fast_test(ua, ub)) {
400         return fast_op(ua.s, ub.s, s);
401     }
402 
403     ur.h = hard(ua.h, ub.h);
404     if (unlikely(f64_is_inf(ur))) {
405         s->float_exception_flags |= float_flag_overflow;
406     } else if (unlikely(fabs(ur.h) <= DBL_MIN)) {
407         if (post == NULL || post(ua, ub)) {
408             goto soft;
409         }
410     }
411     return ur.s;
412 
413  soft:
414     return soft(ua.s, ub.s, s);
415 }
416 
417 /*----------------------------------------------------------------------------
418 | Returns the fraction bits of the single-precision floating-point value `a'.
419 *----------------------------------------------------------------------------*/
420 
421 static inline uint32_t extractFloat32Frac(float32 a)
422 {
423     return float32_val(a) & 0x007FFFFF;
424 }
425 
426 /*----------------------------------------------------------------------------
427 | Returns the exponent bits of the single-precision floating-point value `a'.
428 *----------------------------------------------------------------------------*/
429 
430 static inline int extractFloat32Exp(float32 a)
431 {
432     return (float32_val(a) >> 23) & 0xFF;
433 }
434 
435 /*----------------------------------------------------------------------------
436 | Returns the sign bit of the single-precision floating-point value `a'.
437 *----------------------------------------------------------------------------*/
438 
439 static inline flag extractFloat32Sign(float32 a)
440 {
441     return float32_val(a) >> 31;
442 }
443 
444 /*----------------------------------------------------------------------------
445 | Returns the fraction bits of the double-precision floating-point value `a'.
446 *----------------------------------------------------------------------------*/
447 
448 static inline uint64_t extractFloat64Frac(float64 a)
449 {
450     return float64_val(a) & UINT64_C(0x000FFFFFFFFFFFFF);
451 }
452 
453 /*----------------------------------------------------------------------------
454 | Returns the exponent bits of the double-precision floating-point value `a'.
455 *----------------------------------------------------------------------------*/
456 
457 static inline int extractFloat64Exp(float64 a)
458 {
459     return (float64_val(a) >> 52) & 0x7FF;
460 }
461 
462 /*----------------------------------------------------------------------------
463 | Returns the sign bit of the double-precision floating-point value `a'.
464 *----------------------------------------------------------------------------*/
465 
466 static inline flag extractFloat64Sign(float64 a)
467 {
468     return float64_val(a) >> 63;
469 }
470 
471 /*
472  * Classify a floating point number. Everything above float_class_qnan
473  * is a NaN so cls >= float_class_qnan is any NaN.
474  */
475 
476 typedef enum __attribute__ ((__packed__)) {
477     float_class_unclassified,
478     float_class_zero,
479     float_class_normal,
480     float_class_inf,
481     float_class_qnan,  /* all NaNs from here */
482     float_class_snan,
483 } FloatClass;
484 
485 /* Simple helpers for checking if, or what kind of, NaN we have */
486 static inline __attribute__((unused)) bool is_nan(FloatClass c)
487 {
488     return unlikely(c >= float_class_qnan);
489 }
490 
491 static inline __attribute__((unused)) bool is_snan(FloatClass c)
492 {
493     return c == float_class_snan;
494 }
495 
496 static inline __attribute__((unused)) bool is_qnan(FloatClass c)
497 {
498     return c == float_class_qnan;
499 }
500 
501 /*
502  * Structure holding all of the decomposed parts of a float. The
503  * exponent is unbiased and the fraction is normalized. All
504  * calculations are done with a 64 bit fraction and then rounded as
505  * appropriate for the final format.
506  *
507  * Thanks to the packed FloatClass a decent compiler should be able to
508  * fit the whole structure into registers and avoid using the stack
509  * for parameter passing.
510  */
511 
512 typedef struct {
513     uint64_t frac;
514     int32_t  exp;
515     FloatClass cls;
516     bool sign;
517 } FloatParts;
518 
519 #define DECOMPOSED_BINARY_POINT    (64 - 2)
520 #define DECOMPOSED_IMPLICIT_BIT    (1ull << DECOMPOSED_BINARY_POINT)
521 #define DECOMPOSED_OVERFLOW_BIT    (DECOMPOSED_IMPLICIT_BIT << 1)
522 
523 /* Structure holding all of the relevant parameters for a format.
524  *   exp_size: the size of the exponent field
525  *   exp_bias: the offset applied to the exponent field
526  *   exp_max: the maximum normalised exponent
527  *   frac_size: the size of the fraction field
528  *   frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
529  * The following are computed based the size of fraction
530  *   frac_lsb: least significant bit of fraction
531  *   frac_lsbm1: the bit below the least significant bit (for rounding)
532  *   round_mask/roundeven_mask: masks used for rounding
533  * The following optional modifiers are available:
534  *   arm_althp: handle ARM Alternative Half Precision
535  */
536 typedef struct {
537     int exp_size;
538     int exp_bias;
539     int exp_max;
540     int frac_size;
541     int frac_shift;
542     uint64_t frac_lsb;
543     uint64_t frac_lsbm1;
544     uint64_t round_mask;
545     uint64_t roundeven_mask;
546     bool arm_althp;
547 } FloatFmt;
548 
549 /* Expand fields based on the size of exponent and fraction */
550 #define FLOAT_PARAMS(E, F)                                           \
551     .exp_size       = E,                                             \
552     .exp_bias       = ((1 << E) - 1) >> 1,                           \
553     .exp_max        = (1 << E) - 1,                                  \
554     .frac_size      = F,                                             \
555     .frac_shift     = DECOMPOSED_BINARY_POINT - F,                   \
556     .frac_lsb       = 1ull << (DECOMPOSED_BINARY_POINT - F),         \
557     .frac_lsbm1     = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1),   \
558     .round_mask     = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1,   \
559     .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1
560 
561 static const FloatFmt float16_params = {
562     FLOAT_PARAMS(5, 10)
563 };
564 
565 static const FloatFmt float16_params_ahp = {
566     FLOAT_PARAMS(5, 10),
567     .arm_althp = true
568 };
569 
570 static const FloatFmt float32_params = {
571     FLOAT_PARAMS(8, 23)
572 };
573 
574 static const FloatFmt float64_params = {
575     FLOAT_PARAMS(11, 52)
576 };
577 
578 /* Unpack a float to parts, but do not canonicalize.  */
579 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw)
580 {
581     const int sign_pos = fmt.frac_size + fmt.exp_size;
582 
583     return (FloatParts) {
584         .cls = float_class_unclassified,
585         .sign = extract64(raw, sign_pos, 1),
586         .exp = extract64(raw, fmt.frac_size, fmt.exp_size),
587         .frac = extract64(raw, 0, fmt.frac_size),
588     };
589 }
590 
591 static inline FloatParts float16_unpack_raw(float16 f)
592 {
593     return unpack_raw(float16_params, f);
594 }
595 
596 static inline FloatParts float32_unpack_raw(float32 f)
597 {
598     return unpack_raw(float32_params, f);
599 }
600 
601 static inline FloatParts float64_unpack_raw(float64 f)
602 {
603     return unpack_raw(float64_params, f);
604 }
605 
606 /* Pack a float from parts, but do not canonicalize.  */
607 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p)
608 {
609     const int sign_pos = fmt.frac_size + fmt.exp_size;
610     uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp);
611     return deposit64(ret, sign_pos, 1, p.sign);
612 }
613 
614 static inline float16 float16_pack_raw(FloatParts p)
615 {
616     return make_float16(pack_raw(float16_params, p));
617 }
618 
619 static inline float32 float32_pack_raw(FloatParts p)
620 {
621     return make_float32(pack_raw(float32_params, p));
622 }
623 
624 static inline float64 float64_pack_raw(FloatParts p)
625 {
626     return make_float64(pack_raw(float64_params, p));
627 }
628 
629 /*----------------------------------------------------------------------------
630 | Functions and definitions to determine:  (1) whether tininess for underflow
631 | is detected before or after rounding by default, (2) what (if anything)
632 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
633 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
634 | are propagated from function inputs to output.  These details are target-
635 | specific.
636 *----------------------------------------------------------------------------*/
637 #include "softfloat-specialize.inc.c"
638 
639 /* Canonicalize EXP and FRAC, setting CLS.  */
640 static FloatParts sf_canonicalize(FloatParts part, const FloatFmt *parm,
641                                   float_status *status)
642 {
643     if (part.exp == parm->exp_max && !parm->arm_althp) {
644         if (part.frac == 0) {
645             part.cls = float_class_inf;
646         } else {
647             part.frac <<= parm->frac_shift;
648             part.cls = (parts_is_snan_frac(part.frac, status)
649                         ? float_class_snan : float_class_qnan);
650         }
651     } else if (part.exp == 0) {
652         if (likely(part.frac == 0)) {
653             part.cls = float_class_zero;
654         } else if (status->flush_inputs_to_zero) {
655             float_raise(float_flag_input_denormal, status);
656             part.cls = float_class_zero;
657             part.frac = 0;
658         } else {
659             int shift = clz64(part.frac) - 1;
660             part.cls = float_class_normal;
661             part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
662             part.frac <<= shift;
663         }
664     } else {
665         part.cls = float_class_normal;
666         part.exp -= parm->exp_bias;
667         part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
668     }
669     return part;
670 }
671 
672 /* Round and uncanonicalize a floating-point number by parts. There
673  * are FRAC_SHIFT bits that may require rounding at the bottom of the
674  * fraction; these bits will be removed. The exponent will be biased
675  * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
676  */
677 
678 static FloatParts round_canonical(FloatParts p, float_status *s,
679                                   const FloatFmt *parm)
680 {
681     const uint64_t frac_lsb = parm->frac_lsb;
682     const uint64_t frac_lsbm1 = parm->frac_lsbm1;
683     const uint64_t round_mask = parm->round_mask;
684     const uint64_t roundeven_mask = parm->roundeven_mask;
685     const int exp_max = parm->exp_max;
686     const int frac_shift = parm->frac_shift;
687     uint64_t frac, inc;
688     int exp, flags = 0;
689     bool overflow_norm;
690 
691     frac = p.frac;
692     exp = p.exp;
693 
694     switch (p.cls) {
695     case float_class_normal:
696         switch (s->float_rounding_mode) {
697         case float_round_nearest_even:
698             overflow_norm = false;
699             inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
700             break;
701         case float_round_ties_away:
702             overflow_norm = false;
703             inc = frac_lsbm1;
704             break;
705         case float_round_to_zero:
706             overflow_norm = true;
707             inc = 0;
708             break;
709         case float_round_up:
710             inc = p.sign ? 0 : round_mask;
711             overflow_norm = p.sign;
712             break;
713         case float_round_down:
714             inc = p.sign ? round_mask : 0;
715             overflow_norm = !p.sign;
716             break;
717         case float_round_to_odd:
718             overflow_norm = true;
719             inc = frac & frac_lsb ? 0 : round_mask;
720             break;
721         default:
722             g_assert_not_reached();
723         }
724 
725         exp += parm->exp_bias;
726         if (likely(exp > 0)) {
727             if (frac & round_mask) {
728                 flags |= float_flag_inexact;
729                 frac += inc;
730                 if (frac & DECOMPOSED_OVERFLOW_BIT) {
731                     frac >>= 1;
732                     exp++;
733                 }
734             }
735             frac >>= frac_shift;
736 
737             if (parm->arm_althp) {
738                 /* ARM Alt HP eschews Inf and NaN for a wider exponent.  */
739                 if (unlikely(exp > exp_max)) {
740                     /* Overflow.  Return the maximum normal.  */
741                     flags = float_flag_invalid;
742                     exp = exp_max;
743                     frac = -1;
744                 }
745             } else if (unlikely(exp >= exp_max)) {
746                 flags |= float_flag_overflow | float_flag_inexact;
747                 if (overflow_norm) {
748                     exp = exp_max - 1;
749                     frac = -1;
750                 } else {
751                     p.cls = float_class_inf;
752                     goto do_inf;
753                 }
754             }
755         } else if (s->flush_to_zero) {
756             flags |= float_flag_output_denormal;
757             p.cls = float_class_zero;
758             goto do_zero;
759         } else {
760             bool is_tiny = (s->float_detect_tininess
761                             == float_tininess_before_rounding)
762                         || (exp < 0)
763                         || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT);
764 
765             shift64RightJamming(frac, 1 - exp, &frac);
766             if (frac & round_mask) {
767                 /* Need to recompute round-to-even.  */
768                 switch (s->float_rounding_mode) {
769                 case float_round_nearest_even:
770                     inc = ((frac & roundeven_mask) != frac_lsbm1
771                            ? frac_lsbm1 : 0);
772                     break;
773                 case float_round_to_odd:
774                     inc = frac & frac_lsb ? 0 : round_mask;
775                     break;
776                 }
777                 flags |= float_flag_inexact;
778                 frac += inc;
779             }
780 
781             exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
782             frac >>= frac_shift;
783 
784             if (is_tiny && (flags & float_flag_inexact)) {
785                 flags |= float_flag_underflow;
786             }
787             if (exp == 0 && frac == 0) {
788                 p.cls = float_class_zero;
789             }
790         }
791         break;
792 
793     case float_class_zero:
794     do_zero:
795         exp = 0;
796         frac = 0;
797         break;
798 
799     case float_class_inf:
800     do_inf:
801         assert(!parm->arm_althp);
802         exp = exp_max;
803         frac = 0;
804         break;
805 
806     case float_class_qnan:
807     case float_class_snan:
808         assert(!parm->arm_althp);
809         exp = exp_max;
810         frac >>= parm->frac_shift;
811         break;
812 
813     default:
814         g_assert_not_reached();
815     }
816 
817     float_raise(flags, s);
818     p.exp = exp;
819     p.frac = frac;
820     return p;
821 }
822 
823 /* Explicit FloatFmt version */
824 static FloatParts float16a_unpack_canonical(float16 f, float_status *s,
825                                             const FloatFmt *params)
826 {
827     return sf_canonicalize(float16_unpack_raw(f), params, s);
828 }
829 
830 static FloatParts float16_unpack_canonical(float16 f, float_status *s)
831 {
832     return float16a_unpack_canonical(f, s, &float16_params);
833 }
834 
835 static float16 float16a_round_pack_canonical(FloatParts p, float_status *s,
836                                              const FloatFmt *params)
837 {
838     return float16_pack_raw(round_canonical(p, s, params));
839 }
840 
841 static float16 float16_round_pack_canonical(FloatParts p, float_status *s)
842 {
843     return float16a_round_pack_canonical(p, s, &float16_params);
844 }
845 
846 static FloatParts float32_unpack_canonical(float32 f, float_status *s)
847 {
848     return sf_canonicalize(float32_unpack_raw(f), &float32_params, s);
849 }
850 
851 static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
852 {
853     return float32_pack_raw(round_canonical(p, s, &float32_params));
854 }
855 
856 static FloatParts float64_unpack_canonical(float64 f, float_status *s)
857 {
858     return sf_canonicalize(float64_unpack_raw(f), &float64_params, s);
859 }
860 
861 static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
862 {
863     return float64_pack_raw(round_canonical(p, s, &float64_params));
864 }
865 
866 static FloatParts return_nan(FloatParts a, float_status *s)
867 {
868     switch (a.cls) {
869     case float_class_snan:
870         s->float_exception_flags |= float_flag_invalid;
871         a = parts_silence_nan(a, s);
872         /* fall through */
873     case float_class_qnan:
874         if (s->default_nan_mode) {
875             return parts_default_nan(s);
876         }
877         break;
878 
879     default:
880         g_assert_not_reached();
881     }
882     return a;
883 }
884 
885 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s)
886 {
887     if (is_snan(a.cls) || is_snan(b.cls)) {
888         s->float_exception_flags |= float_flag_invalid;
889     }
890 
891     if (s->default_nan_mode) {
892         return parts_default_nan(s);
893     } else {
894         if (pickNaN(a.cls, b.cls,
895                     a.frac > b.frac ||
896                     (a.frac == b.frac && a.sign < b.sign))) {
897             a = b;
898         }
899         if (is_snan(a.cls)) {
900             return parts_silence_nan(a, s);
901         }
902     }
903     return a;
904 }
905 
906 static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c,
907                                   bool inf_zero, float_status *s)
908 {
909     int which;
910 
911     if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
912         s->float_exception_flags |= float_flag_invalid;
913     }
914 
915     which = pickNaNMulAdd(a.cls, b.cls, c.cls, inf_zero, s);
916 
917     if (s->default_nan_mode) {
918         /* Note that this check is after pickNaNMulAdd so that function
919          * has an opportunity to set the Invalid flag.
920          */
921         which = 3;
922     }
923 
924     switch (which) {
925     case 0:
926         break;
927     case 1:
928         a = b;
929         break;
930     case 2:
931         a = c;
932         break;
933     case 3:
934         return parts_default_nan(s);
935     default:
936         g_assert_not_reached();
937     }
938 
939     if (is_snan(a.cls)) {
940         return parts_silence_nan(a, s);
941     }
942     return a;
943 }
944 
945 /*
946  * Returns the result of adding or subtracting the values of the
947  * floating-point values `a' and `b'. The operation is performed
948  * according to the IEC/IEEE Standard for Binary Floating-Point
949  * Arithmetic.
950  */
951 
952 static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
953                                 float_status *s)
954 {
955     bool a_sign = a.sign;
956     bool b_sign = b.sign ^ subtract;
957 
958     if (a_sign != b_sign) {
959         /* Subtraction */
960 
961         if (a.cls == float_class_normal && b.cls == float_class_normal) {
962             if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
963                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
964                 a.frac = a.frac - b.frac;
965             } else {
966                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
967                 a.frac = b.frac - a.frac;
968                 a.exp = b.exp;
969                 a_sign ^= 1;
970             }
971 
972             if (a.frac == 0) {
973                 a.cls = float_class_zero;
974                 a.sign = s->float_rounding_mode == float_round_down;
975             } else {
976                 int shift = clz64(a.frac) - 1;
977                 a.frac = a.frac << shift;
978                 a.exp = a.exp - shift;
979                 a.sign = a_sign;
980             }
981             return a;
982         }
983         if (is_nan(a.cls) || is_nan(b.cls)) {
984             return pick_nan(a, b, s);
985         }
986         if (a.cls == float_class_inf) {
987             if (b.cls == float_class_inf) {
988                 float_raise(float_flag_invalid, s);
989                 return parts_default_nan(s);
990             }
991             return a;
992         }
993         if (a.cls == float_class_zero && b.cls == float_class_zero) {
994             a.sign = s->float_rounding_mode == float_round_down;
995             return a;
996         }
997         if (a.cls == float_class_zero || b.cls == float_class_inf) {
998             b.sign = a_sign ^ 1;
999             return b;
1000         }
1001         if (b.cls == float_class_zero) {
1002             return a;
1003         }
1004     } else {
1005         /* Addition */
1006         if (a.cls == float_class_normal && b.cls == float_class_normal) {
1007             if (a.exp > b.exp) {
1008                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
1009             } else if (a.exp < b.exp) {
1010                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
1011                 a.exp = b.exp;
1012             }
1013             a.frac += b.frac;
1014             if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
1015                 shift64RightJamming(a.frac, 1, &a.frac);
1016                 a.exp += 1;
1017             }
1018             return a;
1019         }
1020         if (is_nan(a.cls) || is_nan(b.cls)) {
1021             return pick_nan(a, b, s);
1022         }
1023         if (a.cls == float_class_inf || b.cls == float_class_zero) {
1024             return a;
1025         }
1026         if (b.cls == float_class_inf || a.cls == float_class_zero) {
1027             b.sign = b_sign;
1028             return b;
1029         }
1030     }
1031     g_assert_not_reached();
1032 }
1033 
1034 /*
1035  * Returns the result of adding or subtracting the floating-point
1036  * values `a' and `b'. The operation is performed according to the
1037  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1038  */
1039 
1040 float16 QEMU_FLATTEN float16_add(float16 a, float16 b, float_status *status)
1041 {
1042     FloatParts pa = float16_unpack_canonical(a, status);
1043     FloatParts pb = float16_unpack_canonical(b, status);
1044     FloatParts pr = addsub_floats(pa, pb, false, status);
1045 
1046     return float16_round_pack_canonical(pr, status);
1047 }
1048 
1049 float16 QEMU_FLATTEN float16_sub(float16 a, float16 b, float_status *status)
1050 {
1051     FloatParts pa = float16_unpack_canonical(a, status);
1052     FloatParts pb = float16_unpack_canonical(b, status);
1053     FloatParts pr = addsub_floats(pa, pb, true, status);
1054 
1055     return float16_round_pack_canonical(pr, status);
1056 }
1057 
1058 static float32 QEMU_SOFTFLOAT_ATTR
1059 soft_f32_addsub(float32 a, float32 b, bool subtract, float_status *status)
1060 {
1061     FloatParts pa = float32_unpack_canonical(a, status);
1062     FloatParts pb = float32_unpack_canonical(b, status);
1063     FloatParts pr = addsub_floats(pa, pb, subtract, status);
1064 
1065     return float32_round_pack_canonical(pr, status);
1066 }
1067 
1068 static inline float32 soft_f32_add(float32 a, float32 b, float_status *status)
1069 {
1070     return soft_f32_addsub(a, b, false, status);
1071 }
1072 
1073 static inline float32 soft_f32_sub(float32 a, float32 b, float_status *status)
1074 {
1075     return soft_f32_addsub(a, b, true, status);
1076 }
1077 
1078 static float64 QEMU_SOFTFLOAT_ATTR
1079 soft_f64_addsub(float64 a, float64 b, bool subtract, float_status *status)
1080 {
1081     FloatParts pa = float64_unpack_canonical(a, status);
1082     FloatParts pb = float64_unpack_canonical(b, status);
1083     FloatParts pr = addsub_floats(pa, pb, subtract, status);
1084 
1085     return float64_round_pack_canonical(pr, status);
1086 }
1087 
1088 static inline float64 soft_f64_add(float64 a, float64 b, float_status *status)
1089 {
1090     return soft_f64_addsub(a, b, false, status);
1091 }
1092 
1093 static inline float64 soft_f64_sub(float64 a, float64 b, float_status *status)
1094 {
1095     return soft_f64_addsub(a, b, true, status);
1096 }
1097 
1098 static float hard_f32_add(float a, float b)
1099 {
1100     return a + b;
1101 }
1102 
1103 static float hard_f32_sub(float a, float b)
1104 {
1105     return a - b;
1106 }
1107 
1108 static double hard_f64_add(double a, double b)
1109 {
1110     return a + b;
1111 }
1112 
1113 static double hard_f64_sub(double a, double b)
1114 {
1115     return a - b;
1116 }
1117 
1118 static bool f32_addsub_post(union_float32 a, union_float32 b)
1119 {
1120     if (QEMU_HARDFLOAT_2F32_USE_FP) {
1121         return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
1122     }
1123     return !(float32_is_zero(a.s) && float32_is_zero(b.s));
1124 }
1125 
1126 static bool f64_addsub_post(union_float64 a, union_float64 b)
1127 {
1128     if (QEMU_HARDFLOAT_2F64_USE_FP) {
1129         return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
1130     } else {
1131         return !(float64_is_zero(a.s) && float64_is_zero(b.s));
1132     }
1133 }
1134 
1135 static float32 float32_addsub(float32 a, float32 b, float_status *s,
1136                               hard_f32_op2_fn hard, soft_f32_op2_fn soft)
1137 {
1138     return float32_gen2(a, b, s, hard, soft,
1139                         f32_is_zon2, f32_addsub_post, NULL, NULL);
1140 }
1141 
1142 static float64 float64_addsub(float64 a, float64 b, float_status *s,
1143                               hard_f64_op2_fn hard, soft_f64_op2_fn soft)
1144 {
1145     return float64_gen2(a, b, s, hard, soft,
1146                         f64_is_zon2, f64_addsub_post, NULL, NULL);
1147 }
1148 
1149 float32 QEMU_FLATTEN
1150 float32_add(float32 a, float32 b, float_status *s)
1151 {
1152     return float32_addsub(a, b, s, hard_f32_add, soft_f32_add);
1153 }
1154 
1155 float32 QEMU_FLATTEN
1156 float32_sub(float32 a, float32 b, float_status *s)
1157 {
1158     return float32_addsub(a, b, s, hard_f32_sub, soft_f32_sub);
1159 }
1160 
1161 float64 QEMU_FLATTEN
1162 float64_add(float64 a, float64 b, float_status *s)
1163 {
1164     return float64_addsub(a, b, s, hard_f64_add, soft_f64_add);
1165 }
1166 
1167 float64 QEMU_FLATTEN
1168 float64_sub(float64 a, float64 b, float_status *s)
1169 {
1170     return float64_addsub(a, b, s, hard_f64_sub, soft_f64_sub);
1171 }
1172 
1173 /*
1174  * Returns the result of multiplying the floating-point values `a' and
1175  * `b'. The operation is performed according to the IEC/IEEE Standard
1176  * for Binary Floating-Point Arithmetic.
1177  */
1178 
1179 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s)
1180 {
1181     bool sign = a.sign ^ b.sign;
1182 
1183     if (a.cls == float_class_normal && b.cls == float_class_normal) {
1184         uint64_t hi, lo;
1185         int exp = a.exp + b.exp;
1186 
1187         mul64To128(a.frac, b.frac, &hi, &lo);
1188         shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1189         if (lo & DECOMPOSED_OVERFLOW_BIT) {
1190             shift64RightJamming(lo, 1, &lo);
1191             exp += 1;
1192         }
1193 
1194         /* Re-use a */
1195         a.exp = exp;
1196         a.sign = sign;
1197         a.frac = lo;
1198         return a;
1199     }
1200     /* handle all the NaN cases */
1201     if (is_nan(a.cls) || is_nan(b.cls)) {
1202         return pick_nan(a, b, s);
1203     }
1204     /* Inf * Zero == NaN */
1205     if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
1206         (a.cls == float_class_zero && b.cls == float_class_inf)) {
1207         s->float_exception_flags |= float_flag_invalid;
1208         return parts_default_nan(s);
1209     }
1210     /* Multiply by 0 or Inf */
1211     if (a.cls == float_class_inf || a.cls == float_class_zero) {
1212         a.sign = sign;
1213         return a;
1214     }
1215     if (b.cls == float_class_inf || b.cls == float_class_zero) {
1216         b.sign = sign;
1217         return b;
1218     }
1219     g_assert_not_reached();
1220 }
1221 
1222 float16 QEMU_FLATTEN float16_mul(float16 a, float16 b, float_status *status)
1223 {
1224     FloatParts pa = float16_unpack_canonical(a, status);
1225     FloatParts pb = float16_unpack_canonical(b, status);
1226     FloatParts pr = mul_floats(pa, pb, status);
1227 
1228     return float16_round_pack_canonical(pr, status);
1229 }
1230 
1231 static float32 QEMU_SOFTFLOAT_ATTR
1232 soft_f32_mul(float32 a, float32 b, float_status *status)
1233 {
1234     FloatParts pa = float32_unpack_canonical(a, status);
1235     FloatParts pb = float32_unpack_canonical(b, status);
1236     FloatParts pr = mul_floats(pa, pb, status);
1237 
1238     return float32_round_pack_canonical(pr, status);
1239 }
1240 
1241 static float64 QEMU_SOFTFLOAT_ATTR
1242 soft_f64_mul(float64 a, float64 b, float_status *status)
1243 {
1244     FloatParts pa = float64_unpack_canonical(a, status);
1245     FloatParts pb = float64_unpack_canonical(b, status);
1246     FloatParts pr = mul_floats(pa, pb, status);
1247 
1248     return float64_round_pack_canonical(pr, status);
1249 }
1250 
1251 static float hard_f32_mul(float a, float b)
1252 {
1253     return a * b;
1254 }
1255 
1256 static double hard_f64_mul(double a, double b)
1257 {
1258     return a * b;
1259 }
1260 
1261 static bool f32_mul_fast_test(union_float32 a, union_float32 b)
1262 {
1263     return float32_is_zero(a.s) || float32_is_zero(b.s);
1264 }
1265 
1266 static bool f64_mul_fast_test(union_float64 a, union_float64 b)
1267 {
1268     return float64_is_zero(a.s) || float64_is_zero(b.s);
1269 }
1270 
1271 static float32 f32_mul_fast_op(float32 a, float32 b, float_status *s)
1272 {
1273     bool signbit = float32_is_neg(a) ^ float32_is_neg(b);
1274 
1275     return float32_set_sign(float32_zero, signbit);
1276 }
1277 
1278 static float64 f64_mul_fast_op(float64 a, float64 b, float_status *s)
1279 {
1280     bool signbit = float64_is_neg(a) ^ float64_is_neg(b);
1281 
1282     return float64_set_sign(float64_zero, signbit);
1283 }
1284 
1285 float32 QEMU_FLATTEN
1286 float32_mul(float32 a, float32 b, float_status *s)
1287 {
1288     return float32_gen2(a, b, s, hard_f32_mul, soft_f32_mul,
1289                         f32_is_zon2, NULL, f32_mul_fast_test, f32_mul_fast_op);
1290 }
1291 
1292 float64 QEMU_FLATTEN
1293 float64_mul(float64 a, float64 b, float_status *s)
1294 {
1295     return float64_gen2(a, b, s, hard_f64_mul, soft_f64_mul,
1296                         f64_is_zon2, NULL, f64_mul_fast_test, f64_mul_fast_op);
1297 }
1298 
1299 /*
1300  * Returns the result of multiplying the floating-point values `a' and
1301  * `b' then adding 'c', with no intermediate rounding step after the
1302  * multiplication. The operation is performed according to the
1303  * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
1304  * The flags argument allows the caller to select negation of the
1305  * addend, the intermediate product, or the final result. (The
1306  * difference between this and having the caller do a separate
1307  * negation is that negating externally will flip the sign bit on
1308  * NaNs.)
1309  */
1310 
1311 static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c,
1312                                 int flags, float_status *s)
1313 {
1314     bool inf_zero = ((1 << a.cls) | (1 << b.cls)) ==
1315                     ((1 << float_class_inf) | (1 << float_class_zero));
1316     bool p_sign;
1317     bool sign_flip = flags & float_muladd_negate_result;
1318     FloatClass p_class;
1319     uint64_t hi, lo;
1320     int p_exp;
1321 
1322     /* It is implementation-defined whether the cases of (0,inf,qnan)
1323      * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
1324      * they return if they do), so we have to hand this information
1325      * off to the target-specific pick-a-NaN routine.
1326      */
1327     if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) {
1328         return pick_nan_muladd(a, b, c, inf_zero, s);
1329     }
1330 
1331     if (inf_zero) {
1332         s->float_exception_flags |= float_flag_invalid;
1333         return parts_default_nan(s);
1334     }
1335 
1336     if (flags & float_muladd_negate_c) {
1337         c.sign ^= 1;
1338     }
1339 
1340     p_sign = a.sign ^ b.sign;
1341 
1342     if (flags & float_muladd_negate_product) {
1343         p_sign ^= 1;
1344     }
1345 
1346     if (a.cls == float_class_inf || b.cls == float_class_inf) {
1347         p_class = float_class_inf;
1348     } else if (a.cls == float_class_zero || b.cls == float_class_zero) {
1349         p_class = float_class_zero;
1350     } else {
1351         p_class = float_class_normal;
1352     }
1353 
1354     if (c.cls == float_class_inf) {
1355         if (p_class == float_class_inf && p_sign != c.sign) {
1356             s->float_exception_flags |= float_flag_invalid;
1357             return parts_default_nan(s);
1358         } else {
1359             a.cls = float_class_inf;
1360             a.sign = c.sign ^ sign_flip;
1361             return a;
1362         }
1363     }
1364 
1365     if (p_class == float_class_inf) {
1366         a.cls = float_class_inf;
1367         a.sign = p_sign ^ sign_flip;
1368         return a;
1369     }
1370 
1371     if (p_class == float_class_zero) {
1372         if (c.cls == float_class_zero) {
1373             if (p_sign != c.sign) {
1374                 p_sign = s->float_rounding_mode == float_round_down;
1375             }
1376             c.sign = p_sign;
1377         } else if (flags & float_muladd_halve_result) {
1378             c.exp -= 1;
1379         }
1380         c.sign ^= sign_flip;
1381         return c;
1382     }
1383 
1384     /* a & b should be normals now... */
1385     assert(a.cls == float_class_normal &&
1386            b.cls == float_class_normal);
1387 
1388     p_exp = a.exp + b.exp;
1389 
1390     /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
1391      * result.
1392      */
1393     mul64To128(a.frac, b.frac, &hi, &lo);
1394     /* binary point now at bit 124 */
1395 
1396     /* check for overflow */
1397     if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) {
1398         shift128RightJamming(hi, lo, 1, &hi, &lo);
1399         p_exp += 1;
1400     }
1401 
1402     /* + add/sub */
1403     if (c.cls == float_class_zero) {
1404         /* move binary point back to 62 */
1405         shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1406     } else {
1407         int exp_diff = p_exp - c.exp;
1408         if (p_sign == c.sign) {
1409             /* Addition */
1410             if (exp_diff <= 0) {
1411                 shift128RightJamming(hi, lo,
1412                                      DECOMPOSED_BINARY_POINT - exp_diff,
1413                                      &hi, &lo);
1414                 lo += c.frac;
1415                 p_exp = c.exp;
1416             } else {
1417                 uint64_t c_hi, c_lo;
1418                 /* shift c to the same binary point as the product (124) */
1419                 c_hi = c.frac >> 2;
1420                 c_lo = 0;
1421                 shift128RightJamming(c_hi, c_lo,
1422                                      exp_diff,
1423                                      &c_hi, &c_lo);
1424                 add128(hi, lo, c_hi, c_lo, &hi, &lo);
1425                 /* move binary point back to 62 */
1426                 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1427             }
1428 
1429             if (lo & DECOMPOSED_OVERFLOW_BIT) {
1430                 shift64RightJamming(lo, 1, &lo);
1431                 p_exp += 1;
1432             }
1433 
1434         } else {
1435             /* Subtraction */
1436             uint64_t c_hi, c_lo;
1437             /* make C binary point match product at bit 124 */
1438             c_hi = c.frac >> 2;
1439             c_lo = 0;
1440 
1441             if (exp_diff <= 0) {
1442                 shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
1443                 if (exp_diff == 0
1444                     &&
1445                     (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
1446                     sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1447                 } else {
1448                     sub128(c_hi, c_lo, hi, lo, &hi, &lo);
1449                     p_sign ^= 1;
1450                     p_exp = c.exp;
1451                 }
1452             } else {
1453                 shift128RightJamming(c_hi, c_lo,
1454                                      exp_diff,
1455                                      &c_hi, &c_lo);
1456                 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1457             }
1458 
1459             if (hi == 0 && lo == 0) {
1460                 a.cls = float_class_zero;
1461                 a.sign = s->float_rounding_mode == float_round_down;
1462                 a.sign ^= sign_flip;
1463                 return a;
1464             } else {
1465                 int shift;
1466                 if (hi != 0) {
1467                     shift = clz64(hi);
1468                 } else {
1469                     shift = clz64(lo) + 64;
1470                 }
1471                 /* Normalizing to a binary point of 124 is the
1472                    correct adjust for the exponent.  However since we're
1473                    shifting, we might as well put the binary point back
1474                    at 62 where we really want it.  Therefore shift as
1475                    if we're leaving 1 bit at the top of the word, but
1476                    adjust the exponent as if we're leaving 3 bits.  */
1477                 shift -= 1;
1478                 if (shift >= 64) {
1479                     lo = lo << (shift - 64);
1480                 } else {
1481                     hi = (hi << shift) | (lo >> (64 - shift));
1482                     lo = hi | ((lo << shift) != 0);
1483                 }
1484                 p_exp -= shift - 2;
1485             }
1486         }
1487     }
1488 
1489     if (flags & float_muladd_halve_result) {
1490         p_exp -= 1;
1491     }
1492 
1493     /* finally prepare our result */
1494     a.cls = float_class_normal;
1495     a.sign = p_sign ^ sign_flip;
1496     a.exp = p_exp;
1497     a.frac = lo;
1498 
1499     return a;
1500 }
1501 
1502 float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c,
1503                                                 int flags, float_status *status)
1504 {
1505     FloatParts pa = float16_unpack_canonical(a, status);
1506     FloatParts pb = float16_unpack_canonical(b, status);
1507     FloatParts pc = float16_unpack_canonical(c, status);
1508     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1509 
1510     return float16_round_pack_canonical(pr, status);
1511 }
1512 
1513 static float32 QEMU_SOFTFLOAT_ATTR
1514 soft_f32_muladd(float32 a, float32 b, float32 c, int flags,
1515                 float_status *status)
1516 {
1517     FloatParts pa = float32_unpack_canonical(a, status);
1518     FloatParts pb = float32_unpack_canonical(b, status);
1519     FloatParts pc = float32_unpack_canonical(c, status);
1520     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1521 
1522     return float32_round_pack_canonical(pr, status);
1523 }
1524 
1525 static float64 QEMU_SOFTFLOAT_ATTR
1526 soft_f64_muladd(float64 a, float64 b, float64 c, int flags,
1527                 float_status *status)
1528 {
1529     FloatParts pa = float64_unpack_canonical(a, status);
1530     FloatParts pb = float64_unpack_canonical(b, status);
1531     FloatParts pc = float64_unpack_canonical(c, status);
1532     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1533 
1534     return float64_round_pack_canonical(pr, status);
1535 }
1536 
1537 static bool force_soft_fma;
1538 
1539 float32 QEMU_FLATTEN
1540 float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s)
1541 {
1542     union_float32 ua, ub, uc, ur;
1543 
1544     ua.s = xa;
1545     ub.s = xb;
1546     uc.s = xc;
1547 
1548     if (unlikely(!can_use_fpu(s))) {
1549         goto soft;
1550     }
1551     if (unlikely(flags & float_muladd_halve_result)) {
1552         goto soft;
1553     }
1554 
1555     float32_input_flush3(&ua.s, &ub.s, &uc.s, s);
1556     if (unlikely(!f32_is_zon3(ua, ub, uc))) {
1557         goto soft;
1558     }
1559 
1560     if (unlikely(force_soft_fma)) {
1561         goto soft;
1562     }
1563 
1564     /*
1565      * When (a || b) == 0, there's no need to check for under/over flow,
1566      * since we know the addend is (normal || 0) and the product is 0.
1567      */
1568     if (float32_is_zero(ua.s) || float32_is_zero(ub.s)) {
1569         union_float32 up;
1570         bool prod_sign;
1571 
1572         prod_sign = float32_is_neg(ua.s) ^ float32_is_neg(ub.s);
1573         prod_sign ^= !!(flags & float_muladd_negate_product);
1574         up.s = float32_set_sign(float32_zero, prod_sign);
1575 
1576         if (flags & float_muladd_negate_c) {
1577             uc.h = -uc.h;
1578         }
1579         ur.h = up.h + uc.h;
1580     } else {
1581         union_float32 ua_orig = ua;
1582         union_float32 uc_orig = uc;
1583 
1584         if (flags & float_muladd_negate_product) {
1585             ua.h = -ua.h;
1586         }
1587         if (flags & float_muladd_negate_c) {
1588             uc.h = -uc.h;
1589         }
1590 
1591         ur.h = fmaf(ua.h, ub.h, uc.h);
1592 
1593         if (unlikely(f32_is_inf(ur))) {
1594             s->float_exception_flags |= float_flag_overflow;
1595         } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) {
1596             ua = ua_orig;
1597             uc = uc_orig;
1598             goto soft;
1599         }
1600     }
1601     if (flags & float_muladd_negate_result) {
1602         return float32_chs(ur.s);
1603     }
1604     return ur.s;
1605 
1606  soft:
1607     return soft_f32_muladd(ua.s, ub.s, uc.s, flags, s);
1608 }
1609 
1610 float64 QEMU_FLATTEN
1611 float64_muladd(float64 xa, float64 xb, float64 xc, int flags, float_status *s)
1612 {
1613     union_float64 ua, ub, uc, ur;
1614 
1615     ua.s = xa;
1616     ub.s = xb;
1617     uc.s = xc;
1618 
1619     if (unlikely(!can_use_fpu(s))) {
1620         goto soft;
1621     }
1622     if (unlikely(flags & float_muladd_halve_result)) {
1623         goto soft;
1624     }
1625 
1626     float64_input_flush3(&ua.s, &ub.s, &uc.s, s);
1627     if (unlikely(!f64_is_zon3(ua, ub, uc))) {
1628         goto soft;
1629     }
1630 
1631     if (unlikely(force_soft_fma)) {
1632         goto soft;
1633     }
1634 
1635     /*
1636      * When (a || b) == 0, there's no need to check for under/over flow,
1637      * since we know the addend is (normal || 0) and the product is 0.
1638      */
1639     if (float64_is_zero(ua.s) || float64_is_zero(ub.s)) {
1640         union_float64 up;
1641         bool prod_sign;
1642 
1643         prod_sign = float64_is_neg(ua.s) ^ float64_is_neg(ub.s);
1644         prod_sign ^= !!(flags & float_muladd_negate_product);
1645         up.s = float64_set_sign(float64_zero, prod_sign);
1646 
1647         if (flags & float_muladd_negate_c) {
1648             uc.h = -uc.h;
1649         }
1650         ur.h = up.h + uc.h;
1651     } else {
1652         union_float64 ua_orig = ua;
1653         union_float64 uc_orig = uc;
1654 
1655         if (flags & float_muladd_negate_product) {
1656             ua.h = -ua.h;
1657         }
1658         if (flags & float_muladd_negate_c) {
1659             uc.h = -uc.h;
1660         }
1661 
1662         ur.h = fma(ua.h, ub.h, uc.h);
1663 
1664         if (unlikely(f64_is_inf(ur))) {
1665             s->float_exception_flags |= float_flag_overflow;
1666         } else if (unlikely(fabs(ur.h) <= FLT_MIN)) {
1667             ua = ua_orig;
1668             uc = uc_orig;
1669             goto soft;
1670         }
1671     }
1672     if (flags & float_muladd_negate_result) {
1673         return float64_chs(ur.s);
1674     }
1675     return ur.s;
1676 
1677  soft:
1678     return soft_f64_muladd(ua.s, ub.s, uc.s, flags, s);
1679 }
1680 
1681 /*
1682  * Returns the result of dividing the floating-point value `a' by the
1683  * corresponding value `b'. The operation is performed according to
1684  * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1685  */
1686 
1687 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s)
1688 {
1689     bool sign = a.sign ^ b.sign;
1690 
1691     if (a.cls == float_class_normal && b.cls == float_class_normal) {
1692         uint64_t n0, n1, q, r;
1693         int exp = a.exp - b.exp;
1694 
1695         /*
1696          * We want a 2*N / N-bit division to produce exactly an N-bit
1697          * result, so that we do not lose any precision and so that we
1698          * do not have to renormalize afterward.  If A.frac < B.frac,
1699          * then division would produce an (N-1)-bit result; shift A left
1700          * by one to produce the an N-bit result, and decrement the
1701          * exponent to match.
1702          *
1703          * The udiv_qrnnd algorithm that we're using requires normalization,
1704          * i.e. the msb of the denominator must be set.  Since we know that
1705          * DECOMPOSED_BINARY_POINT is msb-1, the inputs must be shifted left
1706          * by one (more), and the remainder must be shifted right by one.
1707          */
1708         if (a.frac < b.frac) {
1709             exp -= 1;
1710             shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 2, &n1, &n0);
1711         } else {
1712             shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, &n1, &n0);
1713         }
1714         q = udiv_qrnnd(&r, n1, n0, b.frac << 1);
1715 
1716         /*
1717          * Set lsb if there is a remainder, to set inexact.
1718          * As mentioned above, to find the actual value of the remainder we
1719          * would need to shift right, but (1) we are only concerned about
1720          * non-zero-ness, and (2) the remainder will always be even because
1721          * both inputs to the division primitive are even.
1722          */
1723         a.frac = q | (r != 0);
1724         a.sign = sign;
1725         a.exp = exp;
1726         return a;
1727     }
1728     /* handle all the NaN cases */
1729     if (is_nan(a.cls) || is_nan(b.cls)) {
1730         return pick_nan(a, b, s);
1731     }
1732     /* 0/0 or Inf/Inf */
1733     if (a.cls == b.cls
1734         &&
1735         (a.cls == float_class_inf || a.cls == float_class_zero)) {
1736         s->float_exception_flags |= float_flag_invalid;
1737         return parts_default_nan(s);
1738     }
1739     /* Inf / x or 0 / x */
1740     if (a.cls == float_class_inf || a.cls == float_class_zero) {
1741         a.sign = sign;
1742         return a;
1743     }
1744     /* Div 0 => Inf */
1745     if (b.cls == float_class_zero) {
1746         s->float_exception_flags |= float_flag_divbyzero;
1747         a.cls = float_class_inf;
1748         a.sign = sign;
1749         return a;
1750     }
1751     /* Div by Inf */
1752     if (b.cls == float_class_inf) {
1753         a.cls = float_class_zero;
1754         a.sign = sign;
1755         return a;
1756     }
1757     g_assert_not_reached();
1758 }
1759 
1760 float16 float16_div(float16 a, float16 b, float_status *status)
1761 {
1762     FloatParts pa = float16_unpack_canonical(a, status);
1763     FloatParts pb = float16_unpack_canonical(b, status);
1764     FloatParts pr = div_floats(pa, pb, status);
1765 
1766     return float16_round_pack_canonical(pr, status);
1767 }
1768 
1769 static float32 QEMU_SOFTFLOAT_ATTR
1770 soft_f32_div(float32 a, float32 b, float_status *status)
1771 {
1772     FloatParts pa = float32_unpack_canonical(a, status);
1773     FloatParts pb = float32_unpack_canonical(b, status);
1774     FloatParts pr = div_floats(pa, pb, status);
1775 
1776     return float32_round_pack_canonical(pr, status);
1777 }
1778 
1779 static float64 QEMU_SOFTFLOAT_ATTR
1780 soft_f64_div(float64 a, float64 b, float_status *status)
1781 {
1782     FloatParts pa = float64_unpack_canonical(a, status);
1783     FloatParts pb = float64_unpack_canonical(b, status);
1784     FloatParts pr = div_floats(pa, pb, status);
1785 
1786     return float64_round_pack_canonical(pr, status);
1787 }
1788 
1789 static float hard_f32_div(float a, float b)
1790 {
1791     return a / b;
1792 }
1793 
1794 static double hard_f64_div(double a, double b)
1795 {
1796     return a / b;
1797 }
1798 
1799 static bool f32_div_pre(union_float32 a, union_float32 b)
1800 {
1801     if (QEMU_HARDFLOAT_2F32_USE_FP) {
1802         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
1803                fpclassify(b.h) == FP_NORMAL;
1804     }
1805     return float32_is_zero_or_normal(a.s) && float32_is_normal(b.s);
1806 }
1807 
1808 static bool f64_div_pre(union_float64 a, union_float64 b)
1809 {
1810     if (QEMU_HARDFLOAT_2F64_USE_FP) {
1811         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
1812                fpclassify(b.h) == FP_NORMAL;
1813     }
1814     return float64_is_zero_or_normal(a.s) && float64_is_normal(b.s);
1815 }
1816 
1817 static bool f32_div_post(union_float32 a, union_float32 b)
1818 {
1819     if (QEMU_HARDFLOAT_2F32_USE_FP) {
1820         return fpclassify(a.h) != FP_ZERO;
1821     }
1822     return !float32_is_zero(a.s);
1823 }
1824 
1825 static bool f64_div_post(union_float64 a, union_float64 b)
1826 {
1827     if (QEMU_HARDFLOAT_2F64_USE_FP) {
1828         return fpclassify(a.h) != FP_ZERO;
1829     }
1830     return !float64_is_zero(a.s);
1831 }
1832 
1833 float32 QEMU_FLATTEN
1834 float32_div(float32 a, float32 b, float_status *s)
1835 {
1836     return float32_gen2(a, b, s, hard_f32_div, soft_f32_div,
1837                         f32_div_pre, f32_div_post, NULL, NULL);
1838 }
1839 
1840 float64 QEMU_FLATTEN
1841 float64_div(float64 a, float64 b, float_status *s)
1842 {
1843     return float64_gen2(a, b, s, hard_f64_div, soft_f64_div,
1844                         f64_div_pre, f64_div_post, NULL, NULL);
1845 }
1846 
1847 /*
1848  * Float to Float conversions
1849  *
1850  * Returns the result of converting one float format to another. The
1851  * conversion is performed according to the IEC/IEEE Standard for
1852  * Binary Floating-Point Arithmetic.
1853  *
1854  * The float_to_float helper only needs to take care of raising
1855  * invalid exceptions and handling the conversion on NaNs.
1856  */
1857 
1858 static FloatParts float_to_float(FloatParts a, const FloatFmt *dstf,
1859                                  float_status *s)
1860 {
1861     if (dstf->arm_althp) {
1862         switch (a.cls) {
1863         case float_class_qnan:
1864         case float_class_snan:
1865             /* There is no NaN in the destination format.  Raise Invalid
1866              * and return a zero with the sign of the input NaN.
1867              */
1868             s->float_exception_flags |= float_flag_invalid;
1869             a.cls = float_class_zero;
1870             a.frac = 0;
1871             a.exp = 0;
1872             break;
1873 
1874         case float_class_inf:
1875             /* There is no Inf in the destination format.  Raise Invalid
1876              * and return the maximum normal with the correct sign.
1877              */
1878             s->float_exception_flags |= float_flag_invalid;
1879             a.cls = float_class_normal;
1880             a.exp = dstf->exp_max;
1881             a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift;
1882             break;
1883 
1884         default:
1885             break;
1886         }
1887     } else if (is_nan(a.cls)) {
1888         if (is_snan(a.cls)) {
1889             s->float_exception_flags |= float_flag_invalid;
1890             a = parts_silence_nan(a, s);
1891         }
1892         if (s->default_nan_mode) {
1893             return parts_default_nan(s);
1894         }
1895     }
1896     return a;
1897 }
1898 
1899 float32 float16_to_float32(float16 a, bool ieee, float_status *s)
1900 {
1901     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1902     FloatParts p = float16a_unpack_canonical(a, s, fmt16);
1903     FloatParts pr = float_to_float(p, &float32_params, s);
1904     return float32_round_pack_canonical(pr, s);
1905 }
1906 
1907 float64 float16_to_float64(float16 a, bool ieee, float_status *s)
1908 {
1909     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1910     FloatParts p = float16a_unpack_canonical(a, s, fmt16);
1911     FloatParts pr = float_to_float(p, &float64_params, s);
1912     return float64_round_pack_canonical(pr, s);
1913 }
1914 
1915 float16 float32_to_float16(float32 a, bool ieee, float_status *s)
1916 {
1917     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1918     FloatParts p = float32_unpack_canonical(a, s);
1919     FloatParts pr = float_to_float(p, fmt16, s);
1920     return float16a_round_pack_canonical(pr, s, fmt16);
1921 }
1922 
1923 static float64 QEMU_SOFTFLOAT_ATTR
1924 soft_float32_to_float64(float32 a, float_status *s)
1925 {
1926     FloatParts p = float32_unpack_canonical(a, s);
1927     FloatParts pr = float_to_float(p, &float64_params, s);
1928     return float64_round_pack_canonical(pr, s);
1929 }
1930 
1931 float64 float32_to_float64(float32 a, float_status *s)
1932 {
1933     if (likely(float32_is_normal(a))) {
1934         /* Widening conversion can never produce inexact results.  */
1935         union_float32 uf;
1936         union_float64 ud;
1937         uf.s = a;
1938         ud.h = uf.h;
1939         return ud.s;
1940     } else if (float32_is_zero(a)) {
1941         return float64_set_sign(float64_zero, float32_is_neg(a));
1942     } else {
1943         return soft_float32_to_float64(a, s);
1944     }
1945 }
1946 
1947 float16 float64_to_float16(float64 a, bool ieee, float_status *s)
1948 {
1949     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1950     FloatParts p = float64_unpack_canonical(a, s);
1951     FloatParts pr = float_to_float(p, fmt16, s);
1952     return float16a_round_pack_canonical(pr, s, fmt16);
1953 }
1954 
1955 float32 float64_to_float32(float64 a, float_status *s)
1956 {
1957     FloatParts p = float64_unpack_canonical(a, s);
1958     FloatParts pr = float_to_float(p, &float32_params, s);
1959     return float32_round_pack_canonical(pr, s);
1960 }
1961 
1962 /*
1963  * Rounds the floating-point value `a' to an integer, and returns the
1964  * result as a floating-point value. The operation is performed
1965  * according to the IEC/IEEE Standard for Binary Floating-Point
1966  * Arithmetic.
1967  */
1968 
1969 static FloatParts round_to_int(FloatParts a, int rmode,
1970                                int scale, float_status *s)
1971 {
1972     switch (a.cls) {
1973     case float_class_qnan:
1974     case float_class_snan:
1975         return return_nan(a, s);
1976 
1977     case float_class_zero:
1978     case float_class_inf:
1979         /* already "integral" */
1980         break;
1981 
1982     case float_class_normal:
1983         scale = MIN(MAX(scale, -0x10000), 0x10000);
1984         a.exp += scale;
1985 
1986         if (a.exp >= DECOMPOSED_BINARY_POINT) {
1987             /* already integral */
1988             break;
1989         }
1990         if (a.exp < 0) {
1991             bool one;
1992             /* all fractional */
1993             s->float_exception_flags |= float_flag_inexact;
1994             switch (rmode) {
1995             case float_round_nearest_even:
1996                 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
1997                 break;
1998             case float_round_ties_away:
1999                 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
2000                 break;
2001             case float_round_to_zero:
2002                 one = false;
2003                 break;
2004             case float_round_up:
2005                 one = !a.sign;
2006                 break;
2007             case float_round_down:
2008                 one = a.sign;
2009                 break;
2010             case float_round_to_odd:
2011                 one = true;
2012                 break;
2013             default:
2014                 g_assert_not_reached();
2015             }
2016 
2017             if (one) {
2018                 a.frac = DECOMPOSED_IMPLICIT_BIT;
2019                 a.exp = 0;
2020             } else {
2021                 a.cls = float_class_zero;
2022             }
2023         } else {
2024             uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
2025             uint64_t frac_lsbm1 = frac_lsb >> 1;
2026             uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
2027             uint64_t rnd_mask = rnd_even_mask >> 1;
2028             uint64_t inc;
2029 
2030             switch (rmode) {
2031             case float_round_nearest_even:
2032                 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
2033                 break;
2034             case float_round_ties_away:
2035                 inc = frac_lsbm1;
2036                 break;
2037             case float_round_to_zero:
2038                 inc = 0;
2039                 break;
2040             case float_round_up:
2041                 inc = a.sign ? 0 : rnd_mask;
2042                 break;
2043             case float_round_down:
2044                 inc = a.sign ? rnd_mask : 0;
2045                 break;
2046             case float_round_to_odd:
2047                 inc = a.frac & frac_lsb ? 0 : rnd_mask;
2048                 break;
2049             default:
2050                 g_assert_not_reached();
2051             }
2052 
2053             if (a.frac & rnd_mask) {
2054                 s->float_exception_flags |= float_flag_inexact;
2055                 a.frac += inc;
2056                 a.frac &= ~rnd_mask;
2057                 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
2058                     a.frac >>= 1;
2059                     a.exp++;
2060                 }
2061             }
2062         }
2063         break;
2064     default:
2065         g_assert_not_reached();
2066     }
2067     return a;
2068 }
2069 
2070 float16 float16_round_to_int(float16 a, float_status *s)
2071 {
2072     FloatParts pa = float16_unpack_canonical(a, s);
2073     FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2074     return float16_round_pack_canonical(pr, s);
2075 }
2076 
2077 float32 float32_round_to_int(float32 a, float_status *s)
2078 {
2079     FloatParts pa = float32_unpack_canonical(a, s);
2080     FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2081     return float32_round_pack_canonical(pr, s);
2082 }
2083 
2084 float64 float64_round_to_int(float64 a, float_status *s)
2085 {
2086     FloatParts pa = float64_unpack_canonical(a, s);
2087     FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2088     return float64_round_pack_canonical(pr, s);
2089 }
2090 
2091 /*
2092  * Returns the result of converting the floating-point value `a' to
2093  * the two's complement integer format. The conversion is performed
2094  * according to the IEC/IEEE Standard for Binary Floating-Point
2095  * Arithmetic---which means in particular that the conversion is
2096  * rounded according to the current rounding mode. If `a' is a NaN,
2097  * the largest positive integer is returned. Otherwise, if the
2098  * conversion overflows, the largest integer with the same sign as `a'
2099  * is returned.
2100 */
2101 
2102 static int64_t round_to_int_and_pack(FloatParts in, int rmode, int scale,
2103                                      int64_t min, int64_t max,
2104                                      float_status *s)
2105 {
2106     uint64_t r;
2107     int orig_flags = get_float_exception_flags(s);
2108     FloatParts p = round_to_int(in, rmode, scale, s);
2109 
2110     switch (p.cls) {
2111     case float_class_snan:
2112     case float_class_qnan:
2113         s->float_exception_flags = orig_flags | float_flag_invalid;
2114         return max;
2115     case float_class_inf:
2116         s->float_exception_flags = orig_flags | float_flag_invalid;
2117         return p.sign ? min : max;
2118     case float_class_zero:
2119         return 0;
2120     case float_class_normal:
2121         if (p.exp < DECOMPOSED_BINARY_POINT) {
2122             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
2123         } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
2124             r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
2125         } else {
2126             r = UINT64_MAX;
2127         }
2128         if (p.sign) {
2129             if (r <= -(uint64_t) min) {
2130                 return -r;
2131             } else {
2132                 s->float_exception_flags = orig_flags | float_flag_invalid;
2133                 return min;
2134             }
2135         } else {
2136             if (r <= max) {
2137                 return r;
2138             } else {
2139                 s->float_exception_flags = orig_flags | float_flag_invalid;
2140                 return max;
2141             }
2142         }
2143     default:
2144         g_assert_not_reached();
2145     }
2146 }
2147 
2148 int16_t float16_to_int16_scalbn(float16 a, int rmode, int scale,
2149                                 float_status *s)
2150 {
2151     return round_to_int_and_pack(float16_unpack_canonical(a, s),
2152                                  rmode, scale, INT16_MIN, INT16_MAX, s);
2153 }
2154 
2155 int32_t float16_to_int32_scalbn(float16 a, int rmode, int scale,
2156                                 float_status *s)
2157 {
2158     return round_to_int_and_pack(float16_unpack_canonical(a, s),
2159                                  rmode, scale, INT32_MIN, INT32_MAX, s);
2160 }
2161 
2162 int64_t float16_to_int64_scalbn(float16 a, int rmode, int scale,
2163                                 float_status *s)
2164 {
2165     return round_to_int_and_pack(float16_unpack_canonical(a, s),
2166                                  rmode, scale, INT64_MIN, INT64_MAX, s);
2167 }
2168 
2169 int16_t float32_to_int16_scalbn(float32 a, int rmode, int scale,
2170                                 float_status *s)
2171 {
2172     return round_to_int_and_pack(float32_unpack_canonical(a, s),
2173                                  rmode, scale, INT16_MIN, INT16_MAX, s);
2174 }
2175 
2176 int32_t float32_to_int32_scalbn(float32 a, int rmode, int scale,
2177                                 float_status *s)
2178 {
2179     return round_to_int_and_pack(float32_unpack_canonical(a, s),
2180                                  rmode, scale, INT32_MIN, INT32_MAX, s);
2181 }
2182 
2183 int64_t float32_to_int64_scalbn(float32 a, int rmode, int scale,
2184                                 float_status *s)
2185 {
2186     return round_to_int_and_pack(float32_unpack_canonical(a, s),
2187                                  rmode, scale, INT64_MIN, INT64_MAX, s);
2188 }
2189 
2190 int16_t float64_to_int16_scalbn(float64 a, int rmode, int scale,
2191                                 float_status *s)
2192 {
2193     return round_to_int_and_pack(float64_unpack_canonical(a, s),
2194                                  rmode, scale, INT16_MIN, INT16_MAX, s);
2195 }
2196 
2197 int32_t float64_to_int32_scalbn(float64 a, int rmode, int scale,
2198                                 float_status *s)
2199 {
2200     return round_to_int_and_pack(float64_unpack_canonical(a, s),
2201                                  rmode, scale, INT32_MIN, INT32_MAX, s);
2202 }
2203 
2204 int64_t float64_to_int64_scalbn(float64 a, int rmode, int scale,
2205                                 float_status *s)
2206 {
2207     return round_to_int_and_pack(float64_unpack_canonical(a, s),
2208                                  rmode, scale, INT64_MIN, INT64_MAX, s);
2209 }
2210 
2211 int16_t float16_to_int16(float16 a, float_status *s)
2212 {
2213     return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2214 }
2215 
2216 int32_t float16_to_int32(float16 a, float_status *s)
2217 {
2218     return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2219 }
2220 
2221 int64_t float16_to_int64(float16 a, float_status *s)
2222 {
2223     return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2224 }
2225 
2226 int16_t float32_to_int16(float32 a, float_status *s)
2227 {
2228     return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2229 }
2230 
2231 int32_t float32_to_int32(float32 a, float_status *s)
2232 {
2233     return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2234 }
2235 
2236 int64_t float32_to_int64(float32 a, float_status *s)
2237 {
2238     return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2239 }
2240 
2241 int16_t float64_to_int16(float64 a, float_status *s)
2242 {
2243     return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2244 }
2245 
2246 int32_t float64_to_int32(float64 a, float_status *s)
2247 {
2248     return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2249 }
2250 
2251 int64_t float64_to_int64(float64 a, float_status *s)
2252 {
2253     return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2254 }
2255 
2256 int16_t float16_to_int16_round_to_zero(float16 a, float_status *s)
2257 {
2258     return float16_to_int16_scalbn(a, float_round_to_zero, 0, s);
2259 }
2260 
2261 int32_t float16_to_int32_round_to_zero(float16 a, float_status *s)
2262 {
2263     return float16_to_int32_scalbn(a, float_round_to_zero, 0, s);
2264 }
2265 
2266 int64_t float16_to_int64_round_to_zero(float16 a, float_status *s)
2267 {
2268     return float16_to_int64_scalbn(a, float_round_to_zero, 0, s);
2269 }
2270 
2271 int16_t float32_to_int16_round_to_zero(float32 a, float_status *s)
2272 {
2273     return float32_to_int16_scalbn(a, float_round_to_zero, 0, s);
2274 }
2275 
2276 int32_t float32_to_int32_round_to_zero(float32 a, float_status *s)
2277 {
2278     return float32_to_int32_scalbn(a, float_round_to_zero, 0, s);
2279 }
2280 
2281 int64_t float32_to_int64_round_to_zero(float32 a, float_status *s)
2282 {
2283     return float32_to_int64_scalbn(a, float_round_to_zero, 0, s);
2284 }
2285 
2286 int16_t float64_to_int16_round_to_zero(float64 a, float_status *s)
2287 {
2288     return float64_to_int16_scalbn(a, float_round_to_zero, 0, s);
2289 }
2290 
2291 int32_t float64_to_int32_round_to_zero(float64 a, float_status *s)
2292 {
2293     return float64_to_int32_scalbn(a, float_round_to_zero, 0, s);
2294 }
2295 
2296 int64_t float64_to_int64_round_to_zero(float64 a, float_status *s)
2297 {
2298     return float64_to_int64_scalbn(a, float_round_to_zero, 0, s);
2299 }
2300 
2301 /*
2302  *  Returns the result of converting the floating-point value `a' to
2303  *  the unsigned integer format. The conversion is performed according
2304  *  to the IEC/IEEE Standard for Binary Floating-Point
2305  *  Arithmetic---which means in particular that the conversion is
2306  *  rounded according to the current rounding mode. If `a' is a NaN,
2307  *  the largest unsigned integer is returned. Otherwise, if the
2308  *  conversion overflows, the largest unsigned integer is returned. If
2309  *  the 'a' is negative, the result is rounded and zero is returned;
2310  *  values that do not round to zero will raise the inexact exception
2311  *  flag.
2312  */
2313 
2314 static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, int scale,
2315                                        uint64_t max, float_status *s)
2316 {
2317     int orig_flags = get_float_exception_flags(s);
2318     FloatParts p = round_to_int(in, rmode, scale, s);
2319     uint64_t r;
2320 
2321     switch (p.cls) {
2322     case float_class_snan:
2323     case float_class_qnan:
2324         s->float_exception_flags = orig_flags | float_flag_invalid;
2325         return max;
2326     case float_class_inf:
2327         s->float_exception_flags = orig_flags | float_flag_invalid;
2328         return p.sign ? 0 : max;
2329     case float_class_zero:
2330         return 0;
2331     case float_class_normal:
2332         if (p.sign) {
2333             s->float_exception_flags = orig_flags | float_flag_invalid;
2334             return 0;
2335         }
2336 
2337         if (p.exp < DECOMPOSED_BINARY_POINT) {
2338             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
2339         } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
2340             r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
2341         } else {
2342             s->float_exception_flags = orig_flags | float_flag_invalid;
2343             return max;
2344         }
2345 
2346         /* For uint64 this will never trip, but if p.exp is too large
2347          * to shift a decomposed fraction we shall have exited via the
2348          * 3rd leg above.
2349          */
2350         if (r > max) {
2351             s->float_exception_flags = orig_flags | float_flag_invalid;
2352             return max;
2353         }
2354         return r;
2355     default:
2356         g_assert_not_reached();
2357     }
2358 }
2359 
2360 uint16_t float16_to_uint16_scalbn(float16 a, int rmode, int scale,
2361                                   float_status *s)
2362 {
2363     return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2364                                   rmode, scale, UINT16_MAX, s);
2365 }
2366 
2367 uint32_t float16_to_uint32_scalbn(float16 a, int rmode, int scale,
2368                                   float_status *s)
2369 {
2370     return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2371                                   rmode, scale, UINT32_MAX, s);
2372 }
2373 
2374 uint64_t float16_to_uint64_scalbn(float16 a, int rmode, int scale,
2375                                   float_status *s)
2376 {
2377     return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2378                                   rmode, scale, UINT64_MAX, s);
2379 }
2380 
2381 uint16_t float32_to_uint16_scalbn(float32 a, int rmode, int scale,
2382                                   float_status *s)
2383 {
2384     return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2385                                   rmode, scale, UINT16_MAX, s);
2386 }
2387 
2388 uint32_t float32_to_uint32_scalbn(float32 a, int rmode, int scale,
2389                                   float_status *s)
2390 {
2391     return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2392                                   rmode, scale, UINT32_MAX, s);
2393 }
2394 
2395 uint64_t float32_to_uint64_scalbn(float32 a, int rmode, int scale,
2396                                   float_status *s)
2397 {
2398     return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2399                                   rmode, scale, UINT64_MAX, s);
2400 }
2401 
2402 uint16_t float64_to_uint16_scalbn(float64 a, int rmode, int scale,
2403                                   float_status *s)
2404 {
2405     return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2406                                   rmode, scale, UINT16_MAX, s);
2407 }
2408 
2409 uint32_t float64_to_uint32_scalbn(float64 a, int rmode, int scale,
2410                                   float_status *s)
2411 {
2412     return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2413                                   rmode, scale, UINT32_MAX, s);
2414 }
2415 
2416 uint64_t float64_to_uint64_scalbn(float64 a, int rmode, int scale,
2417                                   float_status *s)
2418 {
2419     return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2420                                   rmode, scale, UINT64_MAX, s);
2421 }
2422 
2423 uint16_t float16_to_uint16(float16 a, float_status *s)
2424 {
2425     return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2426 }
2427 
2428 uint32_t float16_to_uint32(float16 a, float_status *s)
2429 {
2430     return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2431 }
2432 
2433 uint64_t float16_to_uint64(float16 a, float_status *s)
2434 {
2435     return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2436 }
2437 
2438 uint16_t float32_to_uint16(float32 a, float_status *s)
2439 {
2440     return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2441 }
2442 
2443 uint32_t float32_to_uint32(float32 a, float_status *s)
2444 {
2445     return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2446 }
2447 
2448 uint64_t float32_to_uint64(float32 a, float_status *s)
2449 {
2450     return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2451 }
2452 
2453 uint16_t float64_to_uint16(float64 a, float_status *s)
2454 {
2455     return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2456 }
2457 
2458 uint32_t float64_to_uint32(float64 a, float_status *s)
2459 {
2460     return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2461 }
2462 
2463 uint64_t float64_to_uint64(float64 a, float_status *s)
2464 {
2465     return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2466 }
2467 
2468 uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s)
2469 {
2470     return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2471 }
2472 
2473 uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s)
2474 {
2475     return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2476 }
2477 
2478 uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s)
2479 {
2480     return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2481 }
2482 
2483 uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *s)
2484 {
2485     return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2486 }
2487 
2488 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s)
2489 {
2490     return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2491 }
2492 
2493 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s)
2494 {
2495     return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2496 }
2497 
2498 uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s)
2499 {
2500     return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2501 }
2502 
2503 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s)
2504 {
2505     return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2506 }
2507 
2508 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s)
2509 {
2510     return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2511 }
2512 
2513 /*
2514  * Integer to float conversions
2515  *
2516  * Returns the result of converting the two's complement integer `a'
2517  * to the floating-point format. The conversion is performed according
2518  * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2519  */
2520 
2521 static FloatParts int_to_float(int64_t a, int scale, float_status *status)
2522 {
2523     FloatParts r = { .sign = false };
2524 
2525     if (a == 0) {
2526         r.cls = float_class_zero;
2527     } else {
2528         uint64_t f = a;
2529         int shift;
2530 
2531         r.cls = float_class_normal;
2532         if (a < 0) {
2533             f = -f;
2534             r.sign = true;
2535         }
2536         shift = clz64(f) - 1;
2537         scale = MIN(MAX(scale, -0x10000), 0x10000);
2538 
2539         r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
2540         r.frac = (shift < 0 ? DECOMPOSED_IMPLICIT_BIT : f << shift);
2541     }
2542 
2543     return r;
2544 }
2545 
2546 float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status)
2547 {
2548     FloatParts pa = int_to_float(a, scale, status);
2549     return float16_round_pack_canonical(pa, status);
2550 }
2551 
2552 float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status)
2553 {
2554     return int64_to_float16_scalbn(a, scale, status);
2555 }
2556 
2557 float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status)
2558 {
2559     return int64_to_float16_scalbn(a, scale, status);
2560 }
2561 
2562 float16 int64_to_float16(int64_t a, float_status *status)
2563 {
2564     return int64_to_float16_scalbn(a, 0, status);
2565 }
2566 
2567 float16 int32_to_float16(int32_t a, float_status *status)
2568 {
2569     return int64_to_float16_scalbn(a, 0, status);
2570 }
2571 
2572 float16 int16_to_float16(int16_t a, float_status *status)
2573 {
2574     return int64_to_float16_scalbn(a, 0, status);
2575 }
2576 
2577 float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status)
2578 {
2579     FloatParts pa = int_to_float(a, scale, status);
2580     return float32_round_pack_canonical(pa, status);
2581 }
2582 
2583 float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status)
2584 {
2585     return int64_to_float32_scalbn(a, scale, status);
2586 }
2587 
2588 float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status)
2589 {
2590     return int64_to_float32_scalbn(a, scale, status);
2591 }
2592 
2593 float32 int64_to_float32(int64_t a, float_status *status)
2594 {
2595     return int64_to_float32_scalbn(a, 0, status);
2596 }
2597 
2598 float32 int32_to_float32(int32_t a, float_status *status)
2599 {
2600     return int64_to_float32_scalbn(a, 0, status);
2601 }
2602 
2603 float32 int16_to_float32(int16_t a, float_status *status)
2604 {
2605     return int64_to_float32_scalbn(a, 0, status);
2606 }
2607 
2608 float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status)
2609 {
2610     FloatParts pa = int_to_float(a, scale, status);
2611     return float64_round_pack_canonical(pa, status);
2612 }
2613 
2614 float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status)
2615 {
2616     return int64_to_float64_scalbn(a, scale, status);
2617 }
2618 
2619 float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status)
2620 {
2621     return int64_to_float64_scalbn(a, scale, status);
2622 }
2623 
2624 float64 int64_to_float64(int64_t a, float_status *status)
2625 {
2626     return int64_to_float64_scalbn(a, 0, status);
2627 }
2628 
2629 float64 int32_to_float64(int32_t a, float_status *status)
2630 {
2631     return int64_to_float64_scalbn(a, 0, status);
2632 }
2633 
2634 float64 int16_to_float64(int16_t a, float_status *status)
2635 {
2636     return int64_to_float64_scalbn(a, 0, status);
2637 }
2638 
2639 
2640 /*
2641  * Unsigned Integer to float conversions
2642  *
2643  * Returns the result of converting the unsigned integer `a' to the
2644  * floating-point format. The conversion is performed according to the
2645  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2646  */
2647 
2648 static FloatParts uint_to_float(uint64_t a, int scale, float_status *status)
2649 {
2650     FloatParts r = { .sign = false };
2651 
2652     if (a == 0) {
2653         r.cls = float_class_zero;
2654     } else {
2655         scale = MIN(MAX(scale, -0x10000), 0x10000);
2656         r.cls = float_class_normal;
2657         if ((int64_t)a < 0) {
2658             r.exp = DECOMPOSED_BINARY_POINT + 1 + scale;
2659             shift64RightJamming(a, 1, &a);
2660             r.frac = a;
2661         } else {
2662             int shift = clz64(a) - 1;
2663             r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
2664             r.frac = a << shift;
2665         }
2666     }
2667 
2668     return r;
2669 }
2670 
2671 float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status)
2672 {
2673     FloatParts pa = uint_to_float(a, scale, status);
2674     return float16_round_pack_canonical(pa, status);
2675 }
2676 
2677 float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status)
2678 {
2679     return uint64_to_float16_scalbn(a, scale, status);
2680 }
2681 
2682 float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status)
2683 {
2684     return uint64_to_float16_scalbn(a, scale, status);
2685 }
2686 
2687 float16 uint64_to_float16(uint64_t a, float_status *status)
2688 {
2689     return uint64_to_float16_scalbn(a, 0, status);
2690 }
2691 
2692 float16 uint32_to_float16(uint32_t a, float_status *status)
2693 {
2694     return uint64_to_float16_scalbn(a, 0, status);
2695 }
2696 
2697 float16 uint16_to_float16(uint16_t a, float_status *status)
2698 {
2699     return uint64_to_float16_scalbn(a, 0, status);
2700 }
2701 
2702 float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status)
2703 {
2704     FloatParts pa = uint_to_float(a, scale, status);
2705     return float32_round_pack_canonical(pa, status);
2706 }
2707 
2708 float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status)
2709 {
2710     return uint64_to_float32_scalbn(a, scale, status);
2711 }
2712 
2713 float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status)
2714 {
2715     return uint64_to_float32_scalbn(a, scale, status);
2716 }
2717 
2718 float32 uint64_to_float32(uint64_t a, float_status *status)
2719 {
2720     return uint64_to_float32_scalbn(a, 0, status);
2721 }
2722 
2723 float32 uint32_to_float32(uint32_t a, float_status *status)
2724 {
2725     return uint64_to_float32_scalbn(a, 0, status);
2726 }
2727 
2728 float32 uint16_to_float32(uint16_t a, float_status *status)
2729 {
2730     return uint64_to_float32_scalbn(a, 0, status);
2731 }
2732 
2733 float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status)
2734 {
2735     FloatParts pa = uint_to_float(a, scale, status);
2736     return float64_round_pack_canonical(pa, status);
2737 }
2738 
2739 float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status)
2740 {
2741     return uint64_to_float64_scalbn(a, scale, status);
2742 }
2743 
2744 float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status)
2745 {
2746     return uint64_to_float64_scalbn(a, scale, status);
2747 }
2748 
2749 float64 uint64_to_float64(uint64_t a, float_status *status)
2750 {
2751     return uint64_to_float64_scalbn(a, 0, status);
2752 }
2753 
2754 float64 uint32_to_float64(uint32_t a, float_status *status)
2755 {
2756     return uint64_to_float64_scalbn(a, 0, status);
2757 }
2758 
2759 float64 uint16_to_float64(uint16_t a, float_status *status)
2760 {
2761     return uint64_to_float64_scalbn(a, 0, status);
2762 }
2763 
2764 /* Float Min/Max */
2765 /* min() and max() functions. These can't be implemented as
2766  * 'compare and pick one input' because that would mishandle
2767  * NaNs and +0 vs -0.
2768  *
2769  * minnum() and maxnum() functions. These are similar to the min()
2770  * and max() functions but if one of the arguments is a QNaN and
2771  * the other is numerical then the numerical argument is returned.
2772  * SNaNs will get quietened before being returned.
2773  * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
2774  * and maxNum() operations. min() and max() are the typical min/max
2775  * semantics provided by many CPUs which predate that specification.
2776  *
2777  * minnummag() and maxnummag() functions correspond to minNumMag()
2778  * and minNumMag() from the IEEE-754 2008.
2779  */
2780 static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin,
2781                                 bool ieee, bool ismag, float_status *s)
2782 {
2783     if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
2784         if (ieee) {
2785             /* Takes two floating-point values `a' and `b', one of
2786              * which is a NaN, and returns the appropriate NaN
2787              * result. If either `a' or `b' is a signaling NaN,
2788              * the invalid exception is raised.
2789              */
2790             if (is_snan(a.cls) || is_snan(b.cls)) {
2791                 return pick_nan(a, b, s);
2792             } else if (is_nan(a.cls) && !is_nan(b.cls)) {
2793                 return b;
2794             } else if (is_nan(b.cls) && !is_nan(a.cls)) {
2795                 return a;
2796             }
2797         }
2798         return pick_nan(a, b, s);
2799     } else {
2800         int a_exp, b_exp;
2801 
2802         switch (a.cls) {
2803         case float_class_normal:
2804             a_exp = a.exp;
2805             break;
2806         case float_class_inf:
2807             a_exp = INT_MAX;
2808             break;
2809         case float_class_zero:
2810             a_exp = INT_MIN;
2811             break;
2812         default:
2813             g_assert_not_reached();
2814             break;
2815         }
2816         switch (b.cls) {
2817         case float_class_normal:
2818             b_exp = b.exp;
2819             break;
2820         case float_class_inf:
2821             b_exp = INT_MAX;
2822             break;
2823         case float_class_zero:
2824             b_exp = INT_MIN;
2825             break;
2826         default:
2827             g_assert_not_reached();
2828             break;
2829         }
2830 
2831         if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
2832             bool a_less = a_exp < b_exp;
2833             if (a_exp == b_exp) {
2834                 a_less = a.frac < b.frac;
2835             }
2836             return a_less ^ ismin ? b : a;
2837         }
2838 
2839         if (a.sign == b.sign) {
2840             bool a_less = a_exp < b_exp;
2841             if (a_exp == b_exp) {
2842                 a_less = a.frac < b.frac;
2843             }
2844             return a.sign ^ a_less ^ ismin ? b : a;
2845         } else {
2846             return a.sign ^ ismin ? b : a;
2847         }
2848     }
2849 }
2850 
2851 #define MINMAX(sz, name, ismin, isiee, ismag)                           \
2852 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b,      \
2853                                      float_status *s)                   \
2854 {                                                                       \
2855     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
2856     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
2857     FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);      \
2858                                                                         \
2859     return float ## sz ## _round_pack_canonical(pr, s);                 \
2860 }
2861 
2862 MINMAX(16, min, true, false, false)
2863 MINMAX(16, minnum, true, true, false)
2864 MINMAX(16, minnummag, true, true, true)
2865 MINMAX(16, max, false, false, false)
2866 MINMAX(16, maxnum, false, true, false)
2867 MINMAX(16, maxnummag, false, true, true)
2868 
2869 MINMAX(32, min, true, false, false)
2870 MINMAX(32, minnum, true, true, false)
2871 MINMAX(32, minnummag, true, true, true)
2872 MINMAX(32, max, false, false, false)
2873 MINMAX(32, maxnum, false, true, false)
2874 MINMAX(32, maxnummag, false, true, true)
2875 
2876 MINMAX(64, min, true, false, false)
2877 MINMAX(64, minnum, true, true, false)
2878 MINMAX(64, minnummag, true, true, true)
2879 MINMAX(64, max, false, false, false)
2880 MINMAX(64, maxnum, false, true, false)
2881 MINMAX(64, maxnummag, false, true, true)
2882 
2883 #undef MINMAX
2884 
2885 /* Floating point compare */
2886 static int compare_floats(FloatParts a, FloatParts b, bool is_quiet,
2887                           float_status *s)
2888 {
2889     if (is_nan(a.cls) || is_nan(b.cls)) {
2890         if (!is_quiet ||
2891             a.cls == float_class_snan ||
2892             b.cls == float_class_snan) {
2893             s->float_exception_flags |= float_flag_invalid;
2894         }
2895         return float_relation_unordered;
2896     }
2897 
2898     if (a.cls == float_class_zero) {
2899         if (b.cls == float_class_zero) {
2900             return float_relation_equal;
2901         }
2902         return b.sign ? float_relation_greater : float_relation_less;
2903     } else if (b.cls == float_class_zero) {
2904         return a.sign ? float_relation_less : float_relation_greater;
2905     }
2906 
2907     /* The only really important thing about infinity is its sign. If
2908      * both are infinities the sign marks the smallest of the two.
2909      */
2910     if (a.cls == float_class_inf) {
2911         if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
2912             return float_relation_equal;
2913         }
2914         return a.sign ? float_relation_less : float_relation_greater;
2915     } else if (b.cls == float_class_inf) {
2916         return b.sign ? float_relation_greater : float_relation_less;
2917     }
2918 
2919     if (a.sign != b.sign) {
2920         return a.sign ? float_relation_less : float_relation_greater;
2921     }
2922 
2923     if (a.exp == b.exp) {
2924         if (a.frac == b.frac) {
2925             return float_relation_equal;
2926         }
2927         if (a.sign) {
2928             return a.frac > b.frac ?
2929                 float_relation_less : float_relation_greater;
2930         } else {
2931             return a.frac > b.frac ?
2932                 float_relation_greater : float_relation_less;
2933         }
2934     } else {
2935         if (a.sign) {
2936             return a.exp > b.exp ? float_relation_less : float_relation_greater;
2937         } else {
2938             return a.exp > b.exp ? float_relation_greater : float_relation_less;
2939         }
2940     }
2941 }
2942 
2943 #define COMPARE(name, attr, sz)                                         \
2944 static int attr                                                         \
2945 name(float ## sz a, float ## sz b, bool is_quiet, float_status *s)      \
2946 {                                                                       \
2947     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
2948     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
2949     return compare_floats(pa, pb, is_quiet, s);                         \
2950 }
2951 
2952 COMPARE(soft_f16_compare, QEMU_FLATTEN, 16)
2953 COMPARE(soft_f32_compare, QEMU_SOFTFLOAT_ATTR, 32)
2954 COMPARE(soft_f64_compare, QEMU_SOFTFLOAT_ATTR, 64)
2955 
2956 #undef COMPARE
2957 
2958 int float16_compare(float16 a, float16 b, float_status *s)
2959 {
2960     return soft_f16_compare(a, b, false, s);
2961 }
2962 
2963 int float16_compare_quiet(float16 a, float16 b, float_status *s)
2964 {
2965     return soft_f16_compare(a, b, true, s);
2966 }
2967 
2968 static int QEMU_FLATTEN
2969 f32_compare(float32 xa, float32 xb, bool is_quiet, float_status *s)
2970 {
2971     union_float32 ua, ub;
2972 
2973     ua.s = xa;
2974     ub.s = xb;
2975 
2976     if (QEMU_NO_HARDFLOAT) {
2977         goto soft;
2978     }
2979 
2980     float32_input_flush2(&ua.s, &ub.s, s);
2981     if (isgreaterequal(ua.h, ub.h)) {
2982         if (isgreater(ua.h, ub.h)) {
2983             return float_relation_greater;
2984         }
2985         return float_relation_equal;
2986     }
2987     if (likely(isless(ua.h, ub.h))) {
2988         return float_relation_less;
2989     }
2990     /* The only condition remaining is unordered.
2991      * Fall through to set flags.
2992      */
2993  soft:
2994     return soft_f32_compare(ua.s, ub.s, is_quiet, s);
2995 }
2996 
2997 int float32_compare(float32 a, float32 b, float_status *s)
2998 {
2999     return f32_compare(a, b, false, s);
3000 }
3001 
3002 int float32_compare_quiet(float32 a, float32 b, float_status *s)
3003 {
3004     return f32_compare(a, b, true, s);
3005 }
3006 
3007 static int QEMU_FLATTEN
3008 f64_compare(float64 xa, float64 xb, bool is_quiet, float_status *s)
3009 {
3010     union_float64 ua, ub;
3011 
3012     ua.s = xa;
3013     ub.s = xb;
3014 
3015     if (QEMU_NO_HARDFLOAT) {
3016         goto soft;
3017     }
3018 
3019     float64_input_flush2(&ua.s, &ub.s, s);
3020     if (isgreaterequal(ua.h, ub.h)) {
3021         if (isgreater(ua.h, ub.h)) {
3022             return float_relation_greater;
3023         }
3024         return float_relation_equal;
3025     }
3026     if (likely(isless(ua.h, ub.h))) {
3027         return float_relation_less;
3028     }
3029     /* The only condition remaining is unordered.
3030      * Fall through to set flags.
3031      */
3032  soft:
3033     return soft_f64_compare(ua.s, ub.s, is_quiet, s);
3034 }
3035 
3036 int float64_compare(float64 a, float64 b, float_status *s)
3037 {
3038     return f64_compare(a, b, false, s);
3039 }
3040 
3041 int float64_compare_quiet(float64 a, float64 b, float_status *s)
3042 {
3043     return f64_compare(a, b, true, s);
3044 }
3045 
3046 /* Multiply A by 2 raised to the power N.  */
3047 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s)
3048 {
3049     if (unlikely(is_nan(a.cls))) {
3050         return return_nan(a, s);
3051     }
3052     if (a.cls == float_class_normal) {
3053         /* The largest float type (even though not supported by FloatParts)
3054          * is float128, which has a 15 bit exponent.  Bounding N to 16 bits
3055          * still allows rounding to infinity, without allowing overflow
3056          * within the int32_t that backs FloatParts.exp.
3057          */
3058         n = MIN(MAX(n, -0x10000), 0x10000);
3059         a.exp += n;
3060     }
3061     return a;
3062 }
3063 
3064 float16 float16_scalbn(float16 a, int n, float_status *status)
3065 {
3066     FloatParts pa = float16_unpack_canonical(a, status);
3067     FloatParts pr = scalbn_decomposed(pa, n, status);
3068     return float16_round_pack_canonical(pr, status);
3069 }
3070 
3071 float32 float32_scalbn(float32 a, int n, float_status *status)
3072 {
3073     FloatParts pa = float32_unpack_canonical(a, status);
3074     FloatParts pr = scalbn_decomposed(pa, n, status);
3075     return float32_round_pack_canonical(pr, status);
3076 }
3077 
3078 float64 float64_scalbn(float64 a, int n, float_status *status)
3079 {
3080     FloatParts pa = float64_unpack_canonical(a, status);
3081     FloatParts pr = scalbn_decomposed(pa, n, status);
3082     return float64_round_pack_canonical(pr, status);
3083 }
3084 
3085 /*
3086  * Square Root
3087  *
3088  * The old softfloat code did an approximation step before zeroing in
3089  * on the final result. However for simpleness we just compute the
3090  * square root by iterating down from the implicit bit to enough extra
3091  * bits to ensure we get a correctly rounded result.
3092  *
3093  * This does mean however the calculation is slower than before,
3094  * especially for 64 bit floats.
3095  */
3096 
3097 static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p)
3098 {
3099     uint64_t a_frac, r_frac, s_frac;
3100     int bit, last_bit;
3101 
3102     if (is_nan(a.cls)) {
3103         return return_nan(a, s);
3104     }
3105     if (a.cls == float_class_zero) {
3106         return a;  /* sqrt(+-0) = +-0 */
3107     }
3108     if (a.sign) {
3109         s->float_exception_flags |= float_flag_invalid;
3110         return parts_default_nan(s);
3111     }
3112     if (a.cls == float_class_inf) {
3113         return a;  /* sqrt(+inf) = +inf */
3114     }
3115 
3116     assert(a.cls == float_class_normal);
3117 
3118     /* We need two overflow bits at the top. Adding room for that is a
3119      * right shift. If the exponent is odd, we can discard the low bit
3120      * by multiplying the fraction by 2; that's a left shift. Combine
3121      * those and we shift right if the exponent is even.
3122      */
3123     a_frac = a.frac;
3124     if (!(a.exp & 1)) {
3125         a_frac >>= 1;
3126     }
3127     a.exp >>= 1;
3128 
3129     /* Bit-by-bit computation of sqrt.  */
3130     r_frac = 0;
3131     s_frac = 0;
3132 
3133     /* Iterate from implicit bit down to the 3 extra bits to compute a
3134      * properly rounded result. Remember we've inserted one more bit
3135      * at the top, so these positions are one less.
3136      */
3137     bit = DECOMPOSED_BINARY_POINT - 1;
3138     last_bit = MAX(p->frac_shift - 4, 0);
3139     do {
3140         uint64_t q = 1ULL << bit;
3141         uint64_t t_frac = s_frac + q;
3142         if (t_frac <= a_frac) {
3143             s_frac = t_frac + q;
3144             a_frac -= t_frac;
3145             r_frac += q;
3146         }
3147         a_frac <<= 1;
3148     } while (--bit >= last_bit);
3149 
3150     /* Undo the right shift done above. If there is any remaining
3151      * fraction, the result is inexact. Set the sticky bit.
3152      */
3153     a.frac = (r_frac << 1) + (a_frac != 0);
3154 
3155     return a;
3156 }
3157 
3158 float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status)
3159 {
3160     FloatParts pa = float16_unpack_canonical(a, status);
3161     FloatParts pr = sqrt_float(pa, status, &float16_params);
3162     return float16_round_pack_canonical(pr, status);
3163 }
3164 
3165 static float32 QEMU_SOFTFLOAT_ATTR
3166 soft_f32_sqrt(float32 a, float_status *status)
3167 {
3168     FloatParts pa = float32_unpack_canonical(a, status);
3169     FloatParts pr = sqrt_float(pa, status, &float32_params);
3170     return float32_round_pack_canonical(pr, status);
3171 }
3172 
3173 static float64 QEMU_SOFTFLOAT_ATTR
3174 soft_f64_sqrt(float64 a, float_status *status)
3175 {
3176     FloatParts pa = float64_unpack_canonical(a, status);
3177     FloatParts pr = sqrt_float(pa, status, &float64_params);
3178     return float64_round_pack_canonical(pr, status);
3179 }
3180 
3181 float32 QEMU_FLATTEN float32_sqrt(float32 xa, float_status *s)
3182 {
3183     union_float32 ua, ur;
3184 
3185     ua.s = xa;
3186     if (unlikely(!can_use_fpu(s))) {
3187         goto soft;
3188     }
3189 
3190     float32_input_flush1(&ua.s, s);
3191     if (QEMU_HARDFLOAT_1F32_USE_FP) {
3192         if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
3193                        fpclassify(ua.h) == FP_ZERO) ||
3194                      signbit(ua.h))) {
3195             goto soft;
3196         }
3197     } else if (unlikely(!float32_is_zero_or_normal(ua.s) ||
3198                         float32_is_neg(ua.s))) {
3199         goto soft;
3200     }
3201     ur.h = sqrtf(ua.h);
3202     return ur.s;
3203 
3204  soft:
3205     return soft_f32_sqrt(ua.s, s);
3206 }
3207 
3208 float64 QEMU_FLATTEN float64_sqrt(float64 xa, float_status *s)
3209 {
3210     union_float64 ua, ur;
3211 
3212     ua.s = xa;
3213     if (unlikely(!can_use_fpu(s))) {
3214         goto soft;
3215     }
3216 
3217     float64_input_flush1(&ua.s, s);
3218     if (QEMU_HARDFLOAT_1F64_USE_FP) {
3219         if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
3220                        fpclassify(ua.h) == FP_ZERO) ||
3221                      signbit(ua.h))) {
3222             goto soft;
3223         }
3224     } else if (unlikely(!float64_is_zero_or_normal(ua.s) ||
3225                         float64_is_neg(ua.s))) {
3226         goto soft;
3227     }
3228     ur.h = sqrt(ua.h);
3229     return ur.s;
3230 
3231  soft:
3232     return soft_f64_sqrt(ua.s, s);
3233 }
3234 
3235 /*----------------------------------------------------------------------------
3236 | The pattern for a default generated NaN.
3237 *----------------------------------------------------------------------------*/
3238 
3239 float16 float16_default_nan(float_status *status)
3240 {
3241     FloatParts p = parts_default_nan(status);
3242     p.frac >>= float16_params.frac_shift;
3243     return float16_pack_raw(p);
3244 }
3245 
3246 float32 float32_default_nan(float_status *status)
3247 {
3248     FloatParts p = parts_default_nan(status);
3249     p.frac >>= float32_params.frac_shift;
3250     return float32_pack_raw(p);
3251 }
3252 
3253 float64 float64_default_nan(float_status *status)
3254 {
3255     FloatParts p = parts_default_nan(status);
3256     p.frac >>= float64_params.frac_shift;
3257     return float64_pack_raw(p);
3258 }
3259 
3260 float128 float128_default_nan(float_status *status)
3261 {
3262     FloatParts p = parts_default_nan(status);
3263     float128 r;
3264 
3265     /* Extrapolate from the choices made by parts_default_nan to fill
3266      * in the quad-floating format.  If the low bit is set, assume we
3267      * want to set all non-snan bits.
3268      */
3269     r.low = -(p.frac & 1);
3270     r.high = p.frac >> (DECOMPOSED_BINARY_POINT - 48);
3271     r.high |= UINT64_C(0x7FFF000000000000);
3272     r.high |= (uint64_t)p.sign << 63;
3273 
3274     return r;
3275 }
3276 
3277 /*----------------------------------------------------------------------------
3278 | Returns a quiet NaN from a signalling NaN for the floating point value `a'.
3279 *----------------------------------------------------------------------------*/
3280 
3281 float16 float16_silence_nan(float16 a, float_status *status)
3282 {
3283     FloatParts p = float16_unpack_raw(a);
3284     p.frac <<= float16_params.frac_shift;
3285     p = parts_silence_nan(p, status);
3286     p.frac >>= float16_params.frac_shift;
3287     return float16_pack_raw(p);
3288 }
3289 
3290 float32 float32_silence_nan(float32 a, float_status *status)
3291 {
3292     FloatParts p = float32_unpack_raw(a);
3293     p.frac <<= float32_params.frac_shift;
3294     p = parts_silence_nan(p, status);
3295     p.frac >>= float32_params.frac_shift;
3296     return float32_pack_raw(p);
3297 }
3298 
3299 float64 float64_silence_nan(float64 a, float_status *status)
3300 {
3301     FloatParts p = float64_unpack_raw(a);
3302     p.frac <<= float64_params.frac_shift;
3303     p = parts_silence_nan(p, status);
3304     p.frac >>= float64_params.frac_shift;
3305     return float64_pack_raw(p);
3306 }
3307 
3308 
3309 /*----------------------------------------------------------------------------
3310 | If `a' is denormal and we are in flush-to-zero mode then set the
3311 | input-denormal exception and return zero. Otherwise just return the value.
3312 *----------------------------------------------------------------------------*/
3313 
3314 static bool parts_squash_denormal(FloatParts p, float_status *status)
3315 {
3316     if (p.exp == 0 && p.frac != 0) {
3317         float_raise(float_flag_input_denormal, status);
3318         return true;
3319     }
3320 
3321     return false;
3322 }
3323 
3324 float16 float16_squash_input_denormal(float16 a, float_status *status)
3325 {
3326     if (status->flush_inputs_to_zero) {
3327         FloatParts p = float16_unpack_raw(a);
3328         if (parts_squash_denormal(p, status)) {
3329             return float16_set_sign(float16_zero, p.sign);
3330         }
3331     }
3332     return a;
3333 }
3334 
3335 float32 float32_squash_input_denormal(float32 a, float_status *status)
3336 {
3337     if (status->flush_inputs_to_zero) {
3338         FloatParts p = float32_unpack_raw(a);
3339         if (parts_squash_denormal(p, status)) {
3340             return float32_set_sign(float32_zero, p.sign);
3341         }
3342     }
3343     return a;
3344 }
3345 
3346 float64 float64_squash_input_denormal(float64 a, float_status *status)
3347 {
3348     if (status->flush_inputs_to_zero) {
3349         FloatParts p = float64_unpack_raw(a);
3350         if (parts_squash_denormal(p, status)) {
3351             return float64_set_sign(float64_zero, p.sign);
3352         }
3353     }
3354     return a;
3355 }
3356 
3357 /*----------------------------------------------------------------------------
3358 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
3359 | and 7, and returns the properly rounded 32-bit integer corresponding to the
3360 | input.  If `zSign' is 1, the input is negated before being converted to an
3361 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
3362 | is simply rounded to an integer, with the inexact exception raised if the
3363 | input cannot be represented exactly as an integer.  However, if the fixed-
3364 | point input is too large, the invalid exception is raised and the largest
3365 | positive or negative integer is returned.
3366 *----------------------------------------------------------------------------*/
3367 
3368 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
3369 {
3370     int8_t roundingMode;
3371     flag roundNearestEven;
3372     int8_t roundIncrement, roundBits;
3373     int32_t z;
3374 
3375     roundingMode = status->float_rounding_mode;
3376     roundNearestEven = ( roundingMode == float_round_nearest_even );
3377     switch (roundingMode) {
3378     case float_round_nearest_even:
3379     case float_round_ties_away:
3380         roundIncrement = 0x40;
3381         break;
3382     case float_round_to_zero:
3383         roundIncrement = 0;
3384         break;
3385     case float_round_up:
3386         roundIncrement = zSign ? 0 : 0x7f;
3387         break;
3388     case float_round_down:
3389         roundIncrement = zSign ? 0x7f : 0;
3390         break;
3391     case float_round_to_odd:
3392         roundIncrement = absZ & 0x80 ? 0 : 0x7f;
3393         break;
3394     default:
3395         abort();
3396     }
3397     roundBits = absZ & 0x7F;
3398     absZ = ( absZ + roundIncrement )>>7;
3399     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
3400     z = absZ;
3401     if ( zSign ) z = - z;
3402     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
3403         float_raise(float_flag_invalid, status);
3404         return zSign ? INT32_MIN : INT32_MAX;
3405     }
3406     if (roundBits) {
3407         status->float_exception_flags |= float_flag_inexact;
3408     }
3409     return z;
3410 
3411 }
3412 
3413 /*----------------------------------------------------------------------------
3414 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3415 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3416 | and returns the properly rounded 64-bit integer corresponding to the input.
3417 | If `zSign' is 1, the input is negated before being converted to an integer.
3418 | Ordinarily, the fixed-point input is simply rounded to an integer, with
3419 | the inexact exception raised if the input cannot be represented exactly as
3420 | an integer.  However, if the fixed-point input is too large, the invalid
3421 | exception is raised and the largest positive or negative integer is
3422 | returned.
3423 *----------------------------------------------------------------------------*/
3424 
3425 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
3426                                float_status *status)
3427 {
3428     int8_t roundingMode;
3429     flag roundNearestEven, increment;
3430     int64_t z;
3431 
3432     roundingMode = status->float_rounding_mode;
3433     roundNearestEven = ( roundingMode == float_round_nearest_even );
3434     switch (roundingMode) {
3435     case float_round_nearest_even:
3436     case float_round_ties_away:
3437         increment = ((int64_t) absZ1 < 0);
3438         break;
3439     case float_round_to_zero:
3440         increment = 0;
3441         break;
3442     case float_round_up:
3443         increment = !zSign && absZ1;
3444         break;
3445     case float_round_down:
3446         increment = zSign && absZ1;
3447         break;
3448     case float_round_to_odd:
3449         increment = !(absZ0 & 1) && absZ1;
3450         break;
3451     default:
3452         abort();
3453     }
3454     if ( increment ) {
3455         ++absZ0;
3456         if ( absZ0 == 0 ) goto overflow;
3457         absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
3458     }
3459     z = absZ0;
3460     if ( zSign ) z = - z;
3461     if ( z && ( ( z < 0 ) ^ zSign ) ) {
3462  overflow:
3463         float_raise(float_flag_invalid, status);
3464         return zSign ? INT64_MIN : INT64_MAX;
3465     }
3466     if (absZ1) {
3467         status->float_exception_flags |= float_flag_inexact;
3468     }
3469     return z;
3470 
3471 }
3472 
3473 /*----------------------------------------------------------------------------
3474 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3475 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3476 | and returns the properly rounded 64-bit unsigned integer corresponding to the
3477 | input.  Ordinarily, the fixed-point input is simply rounded to an integer,
3478 | with the inexact exception raised if the input cannot be represented exactly
3479 | as an integer.  However, if the fixed-point input is too large, the invalid
3480 | exception is raised and the largest unsigned integer is returned.
3481 *----------------------------------------------------------------------------*/
3482 
3483 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
3484                                 uint64_t absZ1, float_status *status)
3485 {
3486     int8_t roundingMode;
3487     flag roundNearestEven, increment;
3488 
3489     roundingMode = status->float_rounding_mode;
3490     roundNearestEven = (roundingMode == float_round_nearest_even);
3491     switch (roundingMode) {
3492     case float_round_nearest_even:
3493     case float_round_ties_away:
3494         increment = ((int64_t)absZ1 < 0);
3495         break;
3496     case float_round_to_zero:
3497         increment = 0;
3498         break;
3499     case float_round_up:
3500         increment = !zSign && absZ1;
3501         break;
3502     case float_round_down:
3503         increment = zSign && absZ1;
3504         break;
3505     case float_round_to_odd:
3506         increment = !(absZ0 & 1) && absZ1;
3507         break;
3508     default:
3509         abort();
3510     }
3511     if (increment) {
3512         ++absZ0;
3513         if (absZ0 == 0) {
3514             float_raise(float_flag_invalid, status);
3515             return UINT64_MAX;
3516         }
3517         absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
3518     }
3519 
3520     if (zSign && absZ0) {
3521         float_raise(float_flag_invalid, status);
3522         return 0;
3523     }
3524 
3525     if (absZ1) {
3526         status->float_exception_flags |= float_flag_inexact;
3527     }
3528     return absZ0;
3529 }
3530 
3531 /*----------------------------------------------------------------------------
3532 | Normalizes the subnormal single-precision floating-point value represented
3533 | by the denormalized significand `aSig'.  The normalized exponent and
3534 | significand are stored at the locations pointed to by `zExpPtr' and
3535 | `zSigPtr', respectively.
3536 *----------------------------------------------------------------------------*/
3537 
3538 static void
3539  normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
3540 {
3541     int8_t shiftCount;
3542 
3543     shiftCount = clz32(aSig) - 8;
3544     *zSigPtr = aSig<<shiftCount;
3545     *zExpPtr = 1 - shiftCount;
3546 
3547 }
3548 
3549 /*----------------------------------------------------------------------------
3550 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3551 | and significand `zSig', and returns the proper single-precision floating-
3552 | point value corresponding to the abstract input.  Ordinarily, the abstract
3553 | value is simply rounded and packed into the single-precision format, with
3554 | the inexact exception raised if the abstract input cannot be represented
3555 | exactly.  However, if the abstract value is too large, the overflow and
3556 | inexact exceptions are raised and an infinity or maximal finite value is
3557 | returned.  If the abstract value is too small, the input value is rounded to
3558 | a subnormal number, and the underflow and inexact exceptions are raised if
3559 | the abstract input cannot be represented exactly as a subnormal single-
3560 | precision floating-point number.
3561 |     The input significand `zSig' has its binary point between bits 30
3562 | and 29, which is 7 bits to the left of the usual location.  This shifted
3563 | significand must be normalized or smaller.  If `zSig' is not normalized,
3564 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3565 | and it must not require rounding.  In the usual case that `zSig' is
3566 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3567 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3568 | Binary Floating-Point Arithmetic.
3569 *----------------------------------------------------------------------------*/
3570 
3571 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
3572                                    float_status *status)
3573 {
3574     int8_t roundingMode;
3575     flag roundNearestEven;
3576     int8_t roundIncrement, roundBits;
3577     flag isTiny;
3578 
3579     roundingMode = status->float_rounding_mode;
3580     roundNearestEven = ( roundingMode == float_round_nearest_even );
3581     switch (roundingMode) {
3582     case float_round_nearest_even:
3583     case float_round_ties_away:
3584         roundIncrement = 0x40;
3585         break;
3586     case float_round_to_zero:
3587         roundIncrement = 0;
3588         break;
3589     case float_round_up:
3590         roundIncrement = zSign ? 0 : 0x7f;
3591         break;
3592     case float_round_down:
3593         roundIncrement = zSign ? 0x7f : 0;
3594         break;
3595     case float_round_to_odd:
3596         roundIncrement = zSig & 0x80 ? 0 : 0x7f;
3597         break;
3598     default:
3599         abort();
3600         break;
3601     }
3602     roundBits = zSig & 0x7F;
3603     if ( 0xFD <= (uint16_t) zExp ) {
3604         if (    ( 0xFD < zExp )
3605              || (    ( zExp == 0xFD )
3606                   && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
3607            ) {
3608             bool overflow_to_inf = roundingMode != float_round_to_odd &&
3609                                    roundIncrement != 0;
3610             float_raise(float_flag_overflow | float_flag_inexact, status);
3611             return packFloat32(zSign, 0xFF, -!overflow_to_inf);
3612         }
3613         if ( zExp < 0 ) {
3614             if (status->flush_to_zero) {
3615                 float_raise(float_flag_output_denormal, status);
3616                 return packFloat32(zSign, 0, 0);
3617             }
3618             isTiny =
3619                 (status->float_detect_tininess
3620                  == float_tininess_before_rounding)
3621                 || ( zExp < -1 )
3622                 || ( zSig + roundIncrement < 0x80000000 );
3623             shift32RightJamming( zSig, - zExp, &zSig );
3624             zExp = 0;
3625             roundBits = zSig & 0x7F;
3626             if (isTiny && roundBits) {
3627                 float_raise(float_flag_underflow, status);
3628             }
3629             if (roundingMode == float_round_to_odd) {
3630                 /*
3631                  * For round-to-odd case, the roundIncrement depends on
3632                  * zSig which just changed.
3633                  */
3634                 roundIncrement = zSig & 0x80 ? 0 : 0x7f;
3635             }
3636         }
3637     }
3638     if (roundBits) {
3639         status->float_exception_flags |= float_flag_inexact;
3640     }
3641     zSig = ( zSig + roundIncrement )>>7;
3642     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
3643     if ( zSig == 0 ) zExp = 0;
3644     return packFloat32( zSign, zExp, zSig );
3645 
3646 }
3647 
3648 /*----------------------------------------------------------------------------
3649 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3650 | and significand `zSig', and returns the proper single-precision floating-
3651 | point value corresponding to the abstract input.  This routine is just like
3652 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
3653 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
3654 | floating-point exponent.
3655 *----------------------------------------------------------------------------*/
3656 
3657 static float32
3658  normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
3659                               float_status *status)
3660 {
3661     int8_t shiftCount;
3662 
3663     shiftCount = clz32(zSig) - 1;
3664     return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
3665                                status);
3666 
3667 }
3668 
3669 /*----------------------------------------------------------------------------
3670 | Normalizes the subnormal double-precision floating-point value represented
3671 | by the denormalized significand `aSig'.  The normalized exponent and
3672 | significand are stored at the locations pointed to by `zExpPtr' and
3673 | `zSigPtr', respectively.
3674 *----------------------------------------------------------------------------*/
3675 
3676 static void
3677  normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
3678 {
3679     int8_t shiftCount;
3680 
3681     shiftCount = clz64(aSig) - 11;
3682     *zSigPtr = aSig<<shiftCount;
3683     *zExpPtr = 1 - shiftCount;
3684 
3685 }
3686 
3687 /*----------------------------------------------------------------------------
3688 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3689 | double-precision floating-point value, returning the result.  After being
3690 | shifted into the proper positions, the three fields are simply added
3691 | together to form the result.  This means that any integer portion of `zSig'
3692 | will be added into the exponent.  Since a properly normalized significand
3693 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3694 | than the desired result exponent whenever `zSig' is a complete, normalized
3695 | significand.
3696 *----------------------------------------------------------------------------*/
3697 
3698 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig)
3699 {
3700 
3701     return make_float64(
3702         ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
3703 
3704 }
3705 
3706 /*----------------------------------------------------------------------------
3707 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3708 | and significand `zSig', and returns the proper double-precision floating-
3709 | point value corresponding to the abstract input.  Ordinarily, the abstract
3710 | value is simply rounded and packed into the double-precision format, with
3711 | the inexact exception raised if the abstract input cannot be represented
3712 | exactly.  However, if the abstract value is too large, the overflow and
3713 | inexact exceptions are raised and an infinity or maximal finite value is
3714 | returned.  If the abstract value is too small, the input value is rounded to
3715 | a subnormal number, and the underflow and inexact exceptions are raised if
3716 | the abstract input cannot be represented exactly as a subnormal double-
3717 | precision floating-point number.
3718 |     The input significand `zSig' has its binary point between bits 62
3719 | and 61, which is 10 bits to the left of the usual location.  This shifted
3720 | significand must be normalized or smaller.  If `zSig' is not normalized,
3721 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3722 | and it must not require rounding.  In the usual case that `zSig' is
3723 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3724 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3725 | Binary Floating-Point Arithmetic.
3726 *----------------------------------------------------------------------------*/
3727 
3728 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
3729                                    float_status *status)
3730 {
3731     int8_t roundingMode;
3732     flag roundNearestEven;
3733     int roundIncrement, roundBits;
3734     flag isTiny;
3735 
3736     roundingMode = status->float_rounding_mode;
3737     roundNearestEven = ( roundingMode == float_round_nearest_even );
3738     switch (roundingMode) {
3739     case float_round_nearest_even:
3740     case float_round_ties_away:
3741         roundIncrement = 0x200;
3742         break;
3743     case float_round_to_zero:
3744         roundIncrement = 0;
3745         break;
3746     case float_round_up:
3747         roundIncrement = zSign ? 0 : 0x3ff;
3748         break;
3749     case float_round_down:
3750         roundIncrement = zSign ? 0x3ff : 0;
3751         break;
3752     case float_round_to_odd:
3753         roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
3754         break;
3755     default:
3756         abort();
3757     }
3758     roundBits = zSig & 0x3FF;
3759     if ( 0x7FD <= (uint16_t) zExp ) {
3760         if (    ( 0x7FD < zExp )
3761              || (    ( zExp == 0x7FD )
3762                   && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
3763            ) {
3764             bool overflow_to_inf = roundingMode != float_round_to_odd &&
3765                                    roundIncrement != 0;
3766             float_raise(float_flag_overflow | float_flag_inexact, status);
3767             return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
3768         }
3769         if ( zExp < 0 ) {
3770             if (status->flush_to_zero) {
3771                 float_raise(float_flag_output_denormal, status);
3772                 return packFloat64(zSign, 0, 0);
3773             }
3774             isTiny =
3775                    (status->float_detect_tininess
3776                     == float_tininess_before_rounding)
3777                 || ( zExp < -1 )
3778                 || ( zSig + roundIncrement < UINT64_C(0x8000000000000000) );
3779             shift64RightJamming( zSig, - zExp, &zSig );
3780             zExp = 0;
3781             roundBits = zSig & 0x3FF;
3782             if (isTiny && roundBits) {
3783                 float_raise(float_flag_underflow, status);
3784             }
3785             if (roundingMode == float_round_to_odd) {
3786                 /*
3787                  * For round-to-odd case, the roundIncrement depends on
3788                  * zSig which just changed.
3789                  */
3790                 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
3791             }
3792         }
3793     }
3794     if (roundBits) {
3795         status->float_exception_flags |= float_flag_inexact;
3796     }
3797     zSig = ( zSig + roundIncrement )>>10;
3798     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
3799     if ( zSig == 0 ) zExp = 0;
3800     return packFloat64( zSign, zExp, zSig );
3801 
3802 }
3803 
3804 /*----------------------------------------------------------------------------
3805 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3806 | and significand `zSig', and returns the proper double-precision floating-
3807 | point value corresponding to the abstract input.  This routine is just like
3808 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
3809 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
3810 | floating-point exponent.
3811 *----------------------------------------------------------------------------*/
3812 
3813 static float64
3814  normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
3815                               float_status *status)
3816 {
3817     int8_t shiftCount;
3818 
3819     shiftCount = clz64(zSig) - 1;
3820     return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
3821                                status);
3822 
3823 }
3824 
3825 /*----------------------------------------------------------------------------
3826 | Normalizes the subnormal extended double-precision floating-point value
3827 | represented by the denormalized significand `aSig'.  The normalized exponent
3828 | and significand are stored at the locations pointed to by `zExpPtr' and
3829 | `zSigPtr', respectively.
3830 *----------------------------------------------------------------------------*/
3831 
3832 void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
3833                                 uint64_t *zSigPtr)
3834 {
3835     int8_t shiftCount;
3836 
3837     shiftCount = clz64(aSig);
3838     *zSigPtr = aSig<<shiftCount;
3839     *zExpPtr = 1 - shiftCount;
3840 }
3841 
3842 /*----------------------------------------------------------------------------
3843 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3844 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
3845 | and returns the proper extended double-precision floating-point value
3846 | corresponding to the abstract input.  Ordinarily, the abstract value is
3847 | rounded and packed into the extended double-precision format, with the
3848 | inexact exception raised if the abstract input cannot be represented
3849 | exactly.  However, if the abstract value is too large, the overflow and
3850 | inexact exceptions are raised and an infinity or maximal finite value is
3851 | returned.  If the abstract value is too small, the input value is rounded to
3852 | a subnormal number, and the underflow and inexact exceptions are raised if
3853 | the abstract input cannot be represented exactly as a subnormal extended
3854 | double-precision floating-point number.
3855 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
3856 | number of bits as single or double precision, respectively.  Otherwise, the
3857 | result is rounded to the full precision of the extended double-precision
3858 | format.
3859 |     The input significand must be normalized or smaller.  If the input
3860 | significand is not normalized, `zExp' must be 0; in that case, the result
3861 | returned is a subnormal number, and it must not require rounding.  The
3862 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
3863 | Floating-Point Arithmetic.
3864 *----------------------------------------------------------------------------*/
3865 
3866 floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
3867                               int32_t zExp, uint64_t zSig0, uint64_t zSig1,
3868                               float_status *status)
3869 {
3870     int8_t roundingMode;
3871     flag roundNearestEven, increment, isTiny;
3872     int64_t roundIncrement, roundMask, roundBits;
3873 
3874     roundingMode = status->float_rounding_mode;
3875     roundNearestEven = ( roundingMode == float_round_nearest_even );
3876     if ( roundingPrecision == 80 ) goto precision80;
3877     if ( roundingPrecision == 64 ) {
3878         roundIncrement = UINT64_C(0x0000000000000400);
3879         roundMask = UINT64_C(0x00000000000007FF);
3880     }
3881     else if ( roundingPrecision == 32 ) {
3882         roundIncrement = UINT64_C(0x0000008000000000);
3883         roundMask = UINT64_C(0x000000FFFFFFFFFF);
3884     }
3885     else {
3886         goto precision80;
3887     }
3888     zSig0 |= ( zSig1 != 0 );
3889     switch (roundingMode) {
3890     case float_round_nearest_even:
3891     case float_round_ties_away:
3892         break;
3893     case float_round_to_zero:
3894         roundIncrement = 0;
3895         break;
3896     case float_round_up:
3897         roundIncrement = zSign ? 0 : roundMask;
3898         break;
3899     case float_round_down:
3900         roundIncrement = zSign ? roundMask : 0;
3901         break;
3902     default:
3903         abort();
3904     }
3905     roundBits = zSig0 & roundMask;
3906     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
3907         if (    ( 0x7FFE < zExp )
3908              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
3909            ) {
3910             goto overflow;
3911         }
3912         if ( zExp <= 0 ) {
3913             if (status->flush_to_zero) {
3914                 float_raise(float_flag_output_denormal, status);
3915                 return packFloatx80(zSign, 0, 0);
3916             }
3917             isTiny =
3918                    (status->float_detect_tininess
3919                     == float_tininess_before_rounding)
3920                 || ( zExp < 0 )
3921                 || ( zSig0 <= zSig0 + roundIncrement );
3922             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
3923             zExp = 0;
3924             roundBits = zSig0 & roundMask;
3925             if (isTiny && roundBits) {
3926                 float_raise(float_flag_underflow, status);
3927             }
3928             if (roundBits) {
3929                 status->float_exception_flags |= float_flag_inexact;
3930             }
3931             zSig0 += roundIncrement;
3932             if ( (int64_t) zSig0 < 0 ) zExp = 1;
3933             roundIncrement = roundMask + 1;
3934             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
3935                 roundMask |= roundIncrement;
3936             }
3937             zSig0 &= ~ roundMask;
3938             return packFloatx80( zSign, zExp, zSig0 );
3939         }
3940     }
3941     if (roundBits) {
3942         status->float_exception_flags |= float_flag_inexact;
3943     }
3944     zSig0 += roundIncrement;
3945     if ( zSig0 < roundIncrement ) {
3946         ++zExp;
3947         zSig0 = UINT64_C(0x8000000000000000);
3948     }
3949     roundIncrement = roundMask + 1;
3950     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
3951         roundMask |= roundIncrement;
3952     }
3953     zSig0 &= ~ roundMask;
3954     if ( zSig0 == 0 ) zExp = 0;
3955     return packFloatx80( zSign, zExp, zSig0 );
3956  precision80:
3957     switch (roundingMode) {
3958     case float_round_nearest_even:
3959     case float_round_ties_away:
3960         increment = ((int64_t)zSig1 < 0);
3961         break;
3962     case float_round_to_zero:
3963         increment = 0;
3964         break;
3965     case float_round_up:
3966         increment = !zSign && zSig1;
3967         break;
3968     case float_round_down:
3969         increment = zSign && zSig1;
3970         break;
3971     default:
3972         abort();
3973     }
3974     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
3975         if (    ( 0x7FFE < zExp )
3976              || (    ( zExp == 0x7FFE )
3977                   && ( zSig0 == UINT64_C(0xFFFFFFFFFFFFFFFF) )
3978                   && increment
3979                 )
3980            ) {
3981             roundMask = 0;
3982  overflow:
3983             float_raise(float_flag_overflow | float_flag_inexact, status);
3984             if (    ( roundingMode == float_round_to_zero )
3985                  || ( zSign && ( roundingMode == float_round_up ) )
3986                  || ( ! zSign && ( roundingMode == float_round_down ) )
3987                ) {
3988                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
3989             }
3990             return packFloatx80(zSign,
3991                                 floatx80_infinity_high,
3992                                 floatx80_infinity_low);
3993         }
3994         if ( zExp <= 0 ) {
3995             isTiny =
3996                    (status->float_detect_tininess
3997                     == float_tininess_before_rounding)
3998                 || ( zExp < 0 )
3999                 || ! increment
4000                 || ( zSig0 < UINT64_C(0xFFFFFFFFFFFFFFFF) );
4001             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
4002             zExp = 0;
4003             if (isTiny && zSig1) {
4004                 float_raise(float_flag_underflow, status);
4005             }
4006             if (zSig1) {
4007                 status->float_exception_flags |= float_flag_inexact;
4008             }
4009             switch (roundingMode) {
4010             case float_round_nearest_even:
4011             case float_round_ties_away:
4012                 increment = ((int64_t)zSig1 < 0);
4013                 break;
4014             case float_round_to_zero:
4015                 increment = 0;
4016                 break;
4017             case float_round_up:
4018                 increment = !zSign && zSig1;
4019                 break;
4020             case float_round_down:
4021                 increment = zSign && zSig1;
4022                 break;
4023             default:
4024                 abort();
4025             }
4026             if ( increment ) {
4027                 ++zSig0;
4028                 zSig0 &=
4029                     ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
4030                 if ( (int64_t) zSig0 < 0 ) zExp = 1;
4031             }
4032             return packFloatx80( zSign, zExp, zSig0 );
4033         }
4034     }
4035     if (zSig1) {
4036         status->float_exception_flags |= float_flag_inexact;
4037     }
4038     if ( increment ) {
4039         ++zSig0;
4040         if ( zSig0 == 0 ) {
4041             ++zExp;
4042             zSig0 = UINT64_C(0x8000000000000000);
4043         }
4044         else {
4045             zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
4046         }
4047     }
4048     else {
4049         if ( zSig0 == 0 ) zExp = 0;
4050     }
4051     return packFloatx80( zSign, zExp, zSig0 );
4052 
4053 }
4054 
4055 /*----------------------------------------------------------------------------
4056 | Takes an abstract floating-point value having sign `zSign', exponent
4057 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
4058 | and returns the proper extended double-precision floating-point value
4059 | corresponding to the abstract input.  This routine is just like
4060 | `roundAndPackFloatx80' except that the input significand does not have to be
4061 | normalized.
4062 *----------------------------------------------------------------------------*/
4063 
4064 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
4065                                        flag zSign, int32_t zExp,
4066                                        uint64_t zSig0, uint64_t zSig1,
4067                                        float_status *status)
4068 {
4069     int8_t shiftCount;
4070 
4071     if ( zSig0 == 0 ) {
4072         zSig0 = zSig1;
4073         zSig1 = 0;
4074         zExp -= 64;
4075     }
4076     shiftCount = clz64(zSig0);
4077     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4078     zExp -= shiftCount;
4079     return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
4080                                 zSig0, zSig1, status);
4081 
4082 }
4083 
4084 /*----------------------------------------------------------------------------
4085 | Returns the least-significant 64 fraction bits of the quadruple-precision
4086 | floating-point value `a'.
4087 *----------------------------------------------------------------------------*/
4088 
4089 static inline uint64_t extractFloat128Frac1( float128 a )
4090 {
4091 
4092     return a.low;
4093 
4094 }
4095 
4096 /*----------------------------------------------------------------------------
4097 | Returns the most-significant 48 fraction bits of the quadruple-precision
4098 | floating-point value `a'.
4099 *----------------------------------------------------------------------------*/
4100 
4101 static inline uint64_t extractFloat128Frac0( float128 a )
4102 {
4103 
4104     return a.high & UINT64_C(0x0000FFFFFFFFFFFF);
4105 
4106 }
4107 
4108 /*----------------------------------------------------------------------------
4109 | Returns the exponent bits of the quadruple-precision floating-point value
4110 | `a'.
4111 *----------------------------------------------------------------------------*/
4112 
4113 static inline int32_t extractFloat128Exp( float128 a )
4114 {
4115 
4116     return ( a.high>>48 ) & 0x7FFF;
4117 
4118 }
4119 
4120 /*----------------------------------------------------------------------------
4121 | Returns the sign bit of the quadruple-precision floating-point value `a'.
4122 *----------------------------------------------------------------------------*/
4123 
4124 static inline flag extractFloat128Sign( float128 a )
4125 {
4126 
4127     return a.high>>63;
4128 
4129 }
4130 
4131 /*----------------------------------------------------------------------------
4132 | Normalizes the subnormal quadruple-precision floating-point value
4133 | represented by the denormalized significand formed by the concatenation of
4134 | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
4135 | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
4136 | significand are stored at the location pointed to by `zSig0Ptr', and the
4137 | least significant 64 bits of the normalized significand are stored at the
4138 | location pointed to by `zSig1Ptr'.
4139 *----------------------------------------------------------------------------*/
4140 
4141 static void
4142  normalizeFloat128Subnormal(
4143      uint64_t aSig0,
4144      uint64_t aSig1,
4145      int32_t *zExpPtr,
4146      uint64_t *zSig0Ptr,
4147      uint64_t *zSig1Ptr
4148  )
4149 {
4150     int8_t shiftCount;
4151 
4152     if ( aSig0 == 0 ) {
4153         shiftCount = clz64(aSig1) - 15;
4154         if ( shiftCount < 0 ) {
4155             *zSig0Ptr = aSig1>>( - shiftCount );
4156             *zSig1Ptr = aSig1<<( shiftCount & 63 );
4157         }
4158         else {
4159             *zSig0Ptr = aSig1<<shiftCount;
4160             *zSig1Ptr = 0;
4161         }
4162         *zExpPtr = - shiftCount - 63;
4163     }
4164     else {
4165         shiftCount = clz64(aSig0) - 15;
4166         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
4167         *zExpPtr = 1 - shiftCount;
4168     }
4169 
4170 }
4171 
4172 /*----------------------------------------------------------------------------
4173 | Packs the sign `zSign', the exponent `zExp', and the significand formed
4174 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
4175 | floating-point value, returning the result.  After being shifted into the
4176 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
4177 | added together to form the most significant 32 bits of the result.  This
4178 | means that any integer portion of `zSig0' will be added into the exponent.
4179 | Since a properly normalized significand will have an integer portion equal
4180 | to 1, the `zExp' input should be 1 less than the desired result exponent
4181 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
4182 | significand.
4183 *----------------------------------------------------------------------------*/
4184 
4185 static inline float128
4186  packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
4187 {
4188     float128 z;
4189 
4190     z.low = zSig1;
4191     z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
4192     return z;
4193 
4194 }
4195 
4196 /*----------------------------------------------------------------------------
4197 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4198 | and extended significand formed by the concatenation of `zSig0', `zSig1',
4199 | and `zSig2', and returns the proper quadruple-precision floating-point value
4200 | corresponding to the abstract input.  Ordinarily, the abstract value is
4201 | simply rounded and packed into the quadruple-precision format, with the
4202 | inexact exception raised if the abstract input cannot be represented
4203 | exactly.  However, if the abstract value is too large, the overflow and
4204 | inexact exceptions are raised and an infinity or maximal finite value is
4205 | returned.  If the abstract value is too small, the input value is rounded to
4206 | a subnormal number, and the underflow and inexact exceptions are raised if
4207 | the abstract input cannot be represented exactly as a subnormal quadruple-
4208 | precision floating-point number.
4209 |     The input significand must be normalized or smaller.  If the input
4210 | significand is not normalized, `zExp' must be 0; in that case, the result
4211 | returned is a subnormal number, and it must not require rounding.  In the
4212 | usual case that the input significand is normalized, `zExp' must be 1 less
4213 | than the ``true'' floating-point exponent.  The handling of underflow and
4214 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4215 *----------------------------------------------------------------------------*/
4216 
4217 static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
4218                                      uint64_t zSig0, uint64_t zSig1,
4219                                      uint64_t zSig2, float_status *status)
4220 {
4221     int8_t roundingMode;
4222     flag roundNearestEven, increment, isTiny;
4223 
4224     roundingMode = status->float_rounding_mode;
4225     roundNearestEven = ( roundingMode == float_round_nearest_even );
4226     switch (roundingMode) {
4227     case float_round_nearest_even:
4228     case float_round_ties_away:
4229         increment = ((int64_t)zSig2 < 0);
4230         break;
4231     case float_round_to_zero:
4232         increment = 0;
4233         break;
4234     case float_round_up:
4235         increment = !zSign && zSig2;
4236         break;
4237     case float_round_down:
4238         increment = zSign && zSig2;
4239         break;
4240     case float_round_to_odd:
4241         increment = !(zSig1 & 0x1) && zSig2;
4242         break;
4243     default:
4244         abort();
4245     }
4246     if ( 0x7FFD <= (uint32_t) zExp ) {
4247         if (    ( 0x7FFD < zExp )
4248              || (    ( zExp == 0x7FFD )
4249                   && eq128(
4250                          UINT64_C(0x0001FFFFFFFFFFFF),
4251                          UINT64_C(0xFFFFFFFFFFFFFFFF),
4252                          zSig0,
4253                          zSig1
4254                      )
4255                   && increment
4256                 )
4257            ) {
4258             float_raise(float_flag_overflow | float_flag_inexact, status);
4259             if (    ( roundingMode == float_round_to_zero )
4260                  || ( zSign && ( roundingMode == float_round_up ) )
4261                  || ( ! zSign && ( roundingMode == float_round_down ) )
4262                  || (roundingMode == float_round_to_odd)
4263                ) {
4264                 return
4265                     packFloat128(
4266                         zSign,
4267                         0x7FFE,
4268                         UINT64_C(0x0000FFFFFFFFFFFF),
4269                         UINT64_C(0xFFFFFFFFFFFFFFFF)
4270                     );
4271             }
4272             return packFloat128( zSign, 0x7FFF, 0, 0 );
4273         }
4274         if ( zExp < 0 ) {
4275             if (status->flush_to_zero) {
4276                 float_raise(float_flag_output_denormal, status);
4277                 return packFloat128(zSign, 0, 0, 0);
4278             }
4279             isTiny =
4280                    (status->float_detect_tininess
4281                     == float_tininess_before_rounding)
4282                 || ( zExp < -1 )
4283                 || ! increment
4284                 || lt128(
4285                        zSig0,
4286                        zSig1,
4287                        UINT64_C(0x0001FFFFFFFFFFFF),
4288                        UINT64_C(0xFFFFFFFFFFFFFFFF)
4289                    );
4290             shift128ExtraRightJamming(
4291                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
4292             zExp = 0;
4293             if (isTiny && zSig2) {
4294                 float_raise(float_flag_underflow, status);
4295             }
4296             switch (roundingMode) {
4297             case float_round_nearest_even:
4298             case float_round_ties_away:
4299                 increment = ((int64_t)zSig2 < 0);
4300                 break;
4301             case float_round_to_zero:
4302                 increment = 0;
4303                 break;
4304             case float_round_up:
4305                 increment = !zSign && zSig2;
4306                 break;
4307             case float_round_down:
4308                 increment = zSign && zSig2;
4309                 break;
4310             case float_round_to_odd:
4311                 increment = !(zSig1 & 0x1) && zSig2;
4312                 break;
4313             default:
4314                 abort();
4315             }
4316         }
4317     }
4318     if (zSig2) {
4319         status->float_exception_flags |= float_flag_inexact;
4320     }
4321     if ( increment ) {
4322         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
4323         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
4324     }
4325     else {
4326         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
4327     }
4328     return packFloat128( zSign, zExp, zSig0, zSig1 );
4329 
4330 }
4331 
4332 /*----------------------------------------------------------------------------
4333 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4334 | and significand formed by the concatenation of `zSig0' and `zSig1', and
4335 | returns the proper quadruple-precision floating-point value corresponding
4336 | to the abstract input.  This routine is just like `roundAndPackFloat128'
4337 | except that the input significand has fewer bits and does not have to be
4338 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
4339 | point exponent.
4340 *----------------------------------------------------------------------------*/
4341 
4342 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
4343                                               uint64_t zSig0, uint64_t zSig1,
4344                                               float_status *status)
4345 {
4346     int8_t shiftCount;
4347     uint64_t zSig2;
4348 
4349     if ( zSig0 == 0 ) {
4350         zSig0 = zSig1;
4351         zSig1 = 0;
4352         zExp -= 64;
4353     }
4354     shiftCount = clz64(zSig0) - 15;
4355     if ( 0 <= shiftCount ) {
4356         zSig2 = 0;
4357         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4358     }
4359     else {
4360         shift128ExtraRightJamming(
4361             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
4362     }
4363     zExp -= shiftCount;
4364     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
4365 
4366 }
4367 
4368 
4369 /*----------------------------------------------------------------------------
4370 | Returns the result of converting the 32-bit two's complement integer `a'
4371 | to the extended double-precision floating-point format.  The conversion
4372 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4373 | Arithmetic.
4374 *----------------------------------------------------------------------------*/
4375 
4376 floatx80 int32_to_floatx80(int32_t a, float_status *status)
4377 {
4378     flag zSign;
4379     uint32_t absA;
4380     int8_t shiftCount;
4381     uint64_t zSig;
4382 
4383     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
4384     zSign = ( a < 0 );
4385     absA = zSign ? - a : a;
4386     shiftCount = clz32(absA) + 32;
4387     zSig = absA;
4388     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
4389 
4390 }
4391 
4392 /*----------------------------------------------------------------------------
4393 | Returns the result of converting the 32-bit two's complement integer `a' to
4394 | the quadruple-precision floating-point format.  The conversion is performed
4395 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4396 *----------------------------------------------------------------------------*/
4397 
4398 float128 int32_to_float128(int32_t a, float_status *status)
4399 {
4400     flag zSign;
4401     uint32_t absA;
4402     int8_t shiftCount;
4403     uint64_t zSig0;
4404 
4405     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
4406     zSign = ( a < 0 );
4407     absA = zSign ? - a : a;
4408     shiftCount = clz32(absA) + 17;
4409     zSig0 = absA;
4410     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
4411 
4412 }
4413 
4414 /*----------------------------------------------------------------------------
4415 | Returns the result of converting the 64-bit two's complement integer `a'
4416 | to the extended double-precision floating-point format.  The conversion
4417 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4418 | Arithmetic.
4419 *----------------------------------------------------------------------------*/
4420 
4421 floatx80 int64_to_floatx80(int64_t a, float_status *status)
4422 {
4423     flag zSign;
4424     uint64_t absA;
4425     int8_t shiftCount;
4426 
4427     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
4428     zSign = ( a < 0 );
4429     absA = zSign ? - a : a;
4430     shiftCount = clz64(absA);
4431     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
4432 
4433 }
4434 
4435 /*----------------------------------------------------------------------------
4436 | Returns the result of converting the 64-bit two's complement integer `a' to
4437 | the quadruple-precision floating-point format.  The conversion is performed
4438 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4439 *----------------------------------------------------------------------------*/
4440 
4441 float128 int64_to_float128(int64_t a, float_status *status)
4442 {
4443     flag zSign;
4444     uint64_t absA;
4445     int8_t shiftCount;
4446     int32_t zExp;
4447     uint64_t zSig0, zSig1;
4448 
4449     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
4450     zSign = ( a < 0 );
4451     absA = zSign ? - a : a;
4452     shiftCount = clz64(absA) + 49;
4453     zExp = 0x406E - shiftCount;
4454     if ( 64 <= shiftCount ) {
4455         zSig1 = 0;
4456         zSig0 = absA;
4457         shiftCount -= 64;
4458     }
4459     else {
4460         zSig1 = absA;
4461         zSig0 = 0;
4462     }
4463     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4464     return packFloat128( zSign, zExp, zSig0, zSig1 );
4465 
4466 }
4467 
4468 /*----------------------------------------------------------------------------
4469 | Returns the result of converting the 64-bit unsigned integer `a'
4470 | to the quadruple-precision floating-point format.  The conversion is performed
4471 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4472 *----------------------------------------------------------------------------*/
4473 
4474 float128 uint64_to_float128(uint64_t a, float_status *status)
4475 {
4476     if (a == 0) {
4477         return float128_zero;
4478     }
4479     return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status);
4480 }
4481 
4482 /*----------------------------------------------------------------------------
4483 | Returns the result of converting the single-precision floating-point value
4484 | `a' to the extended double-precision floating-point format.  The conversion
4485 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4486 | Arithmetic.
4487 *----------------------------------------------------------------------------*/
4488 
4489 floatx80 float32_to_floatx80(float32 a, float_status *status)
4490 {
4491     flag aSign;
4492     int aExp;
4493     uint32_t aSig;
4494 
4495     a = float32_squash_input_denormal(a, status);
4496     aSig = extractFloat32Frac( a );
4497     aExp = extractFloat32Exp( a );
4498     aSign = extractFloat32Sign( a );
4499     if ( aExp == 0xFF ) {
4500         if (aSig) {
4501             return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
4502         }
4503         return packFloatx80(aSign,
4504                             floatx80_infinity_high,
4505                             floatx80_infinity_low);
4506     }
4507     if ( aExp == 0 ) {
4508         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
4509         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4510     }
4511     aSig |= 0x00800000;
4512     return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
4513 
4514 }
4515 
4516 /*----------------------------------------------------------------------------
4517 | Returns the result of converting the single-precision floating-point value
4518 | `a' to the double-precision floating-point format.  The conversion is
4519 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4520 | Arithmetic.
4521 *----------------------------------------------------------------------------*/
4522 
4523 float128 float32_to_float128(float32 a, float_status *status)
4524 {
4525     flag aSign;
4526     int aExp;
4527     uint32_t aSig;
4528 
4529     a = float32_squash_input_denormal(a, status);
4530     aSig = extractFloat32Frac( a );
4531     aExp = extractFloat32Exp( a );
4532     aSign = extractFloat32Sign( a );
4533     if ( aExp == 0xFF ) {
4534         if (aSig) {
4535             return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
4536         }
4537         return packFloat128( aSign, 0x7FFF, 0, 0 );
4538     }
4539     if ( aExp == 0 ) {
4540         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
4541         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4542         --aExp;
4543     }
4544     return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
4545 
4546 }
4547 
4548 /*----------------------------------------------------------------------------
4549 | Returns the remainder of the single-precision floating-point value `a'
4550 | with respect to the corresponding value `b'.  The operation is performed
4551 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4552 *----------------------------------------------------------------------------*/
4553 
4554 float32 float32_rem(float32 a, float32 b, float_status *status)
4555 {
4556     flag aSign, zSign;
4557     int aExp, bExp, expDiff;
4558     uint32_t aSig, bSig;
4559     uint32_t q;
4560     uint64_t aSig64, bSig64, q64;
4561     uint32_t alternateASig;
4562     int32_t sigMean;
4563     a = float32_squash_input_denormal(a, status);
4564     b = float32_squash_input_denormal(b, status);
4565 
4566     aSig = extractFloat32Frac( a );
4567     aExp = extractFloat32Exp( a );
4568     aSign = extractFloat32Sign( a );
4569     bSig = extractFloat32Frac( b );
4570     bExp = extractFloat32Exp( b );
4571     if ( aExp == 0xFF ) {
4572         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
4573             return propagateFloat32NaN(a, b, status);
4574         }
4575         float_raise(float_flag_invalid, status);
4576         return float32_default_nan(status);
4577     }
4578     if ( bExp == 0xFF ) {
4579         if (bSig) {
4580             return propagateFloat32NaN(a, b, status);
4581         }
4582         return a;
4583     }
4584     if ( bExp == 0 ) {
4585         if ( bSig == 0 ) {
4586             float_raise(float_flag_invalid, status);
4587             return float32_default_nan(status);
4588         }
4589         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
4590     }
4591     if ( aExp == 0 ) {
4592         if ( aSig == 0 ) return a;
4593         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4594     }
4595     expDiff = aExp - bExp;
4596     aSig |= 0x00800000;
4597     bSig |= 0x00800000;
4598     if ( expDiff < 32 ) {
4599         aSig <<= 8;
4600         bSig <<= 8;
4601         if ( expDiff < 0 ) {
4602             if ( expDiff < -1 ) return a;
4603             aSig >>= 1;
4604         }
4605         q = ( bSig <= aSig );
4606         if ( q ) aSig -= bSig;
4607         if ( 0 < expDiff ) {
4608             q = ( ( (uint64_t) aSig )<<32 ) / bSig;
4609             q >>= 32 - expDiff;
4610             bSig >>= 2;
4611             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4612         }
4613         else {
4614             aSig >>= 2;
4615             bSig >>= 2;
4616         }
4617     }
4618     else {
4619         if ( bSig <= aSig ) aSig -= bSig;
4620         aSig64 = ( (uint64_t) aSig )<<40;
4621         bSig64 = ( (uint64_t) bSig )<<40;
4622         expDiff -= 64;
4623         while ( 0 < expDiff ) {
4624             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
4625             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
4626             aSig64 = - ( ( bSig * q64 )<<38 );
4627             expDiff -= 62;
4628         }
4629         expDiff += 64;
4630         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
4631         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
4632         q = q64>>( 64 - expDiff );
4633         bSig <<= 6;
4634         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
4635     }
4636     do {
4637         alternateASig = aSig;
4638         ++q;
4639         aSig -= bSig;
4640     } while ( 0 <= (int32_t) aSig );
4641     sigMean = aSig + alternateASig;
4642     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4643         aSig = alternateASig;
4644     }
4645     zSign = ( (int32_t) aSig < 0 );
4646     if ( zSign ) aSig = - aSig;
4647     return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
4648 }
4649 
4650 
4651 
4652 /*----------------------------------------------------------------------------
4653 | Returns the binary exponential of the single-precision floating-point value
4654 | `a'. The operation is performed according to the IEC/IEEE Standard for
4655 | Binary Floating-Point Arithmetic.
4656 |
4657 | Uses the following identities:
4658 |
4659 | 1. -------------------------------------------------------------------------
4660 |      x    x*ln(2)
4661 |     2  = e
4662 |
4663 | 2. -------------------------------------------------------------------------
4664 |                      2     3     4     5           n
4665 |      x        x     x     x     x     x           x
4666 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
4667 |               1!    2!    3!    4!    5!          n!
4668 *----------------------------------------------------------------------------*/
4669 
4670 static const float64 float32_exp2_coefficients[15] =
4671 {
4672     const_float64( 0x3ff0000000000000ll ), /*  1 */
4673     const_float64( 0x3fe0000000000000ll ), /*  2 */
4674     const_float64( 0x3fc5555555555555ll ), /*  3 */
4675     const_float64( 0x3fa5555555555555ll ), /*  4 */
4676     const_float64( 0x3f81111111111111ll ), /*  5 */
4677     const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
4678     const_float64( 0x3f2a01a01a01a01all ), /*  7 */
4679     const_float64( 0x3efa01a01a01a01all ), /*  8 */
4680     const_float64( 0x3ec71de3a556c734ll ), /*  9 */
4681     const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
4682     const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
4683     const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
4684     const_float64( 0x3de6124613a86d09ll ), /* 13 */
4685     const_float64( 0x3da93974a8c07c9dll ), /* 14 */
4686     const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
4687 };
4688 
4689 float32 float32_exp2(float32 a, float_status *status)
4690 {
4691     flag aSign;
4692     int aExp;
4693     uint32_t aSig;
4694     float64 r, x, xn;
4695     int i;
4696     a = float32_squash_input_denormal(a, status);
4697 
4698     aSig = extractFloat32Frac( a );
4699     aExp = extractFloat32Exp( a );
4700     aSign = extractFloat32Sign( a );
4701 
4702     if ( aExp == 0xFF) {
4703         if (aSig) {
4704             return propagateFloat32NaN(a, float32_zero, status);
4705         }
4706         return (aSign) ? float32_zero : a;
4707     }
4708     if (aExp == 0) {
4709         if (aSig == 0) return float32_one;
4710     }
4711 
4712     float_raise(float_flag_inexact, status);
4713 
4714     /* ******************************* */
4715     /* using float64 for approximation */
4716     /* ******************************* */
4717     x = float32_to_float64(a, status);
4718     x = float64_mul(x, float64_ln2, status);
4719 
4720     xn = x;
4721     r = float64_one;
4722     for (i = 0 ; i < 15 ; i++) {
4723         float64 f;
4724 
4725         f = float64_mul(xn, float32_exp2_coefficients[i], status);
4726         r = float64_add(r, f, status);
4727 
4728         xn = float64_mul(xn, x, status);
4729     }
4730 
4731     return float64_to_float32(r, status);
4732 }
4733 
4734 /*----------------------------------------------------------------------------
4735 | Returns the binary log of the single-precision floating-point value `a'.
4736 | The operation is performed according to the IEC/IEEE Standard for Binary
4737 | Floating-Point Arithmetic.
4738 *----------------------------------------------------------------------------*/
4739 float32 float32_log2(float32 a, float_status *status)
4740 {
4741     flag aSign, zSign;
4742     int aExp;
4743     uint32_t aSig, zSig, i;
4744 
4745     a = float32_squash_input_denormal(a, status);
4746     aSig = extractFloat32Frac( a );
4747     aExp = extractFloat32Exp( a );
4748     aSign = extractFloat32Sign( a );
4749 
4750     if ( aExp == 0 ) {
4751         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
4752         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4753     }
4754     if ( aSign ) {
4755         float_raise(float_flag_invalid, status);
4756         return float32_default_nan(status);
4757     }
4758     if ( aExp == 0xFF ) {
4759         if (aSig) {
4760             return propagateFloat32NaN(a, float32_zero, status);
4761         }
4762         return a;
4763     }
4764 
4765     aExp -= 0x7F;
4766     aSig |= 0x00800000;
4767     zSign = aExp < 0;
4768     zSig = aExp << 23;
4769 
4770     for (i = 1 << 22; i > 0; i >>= 1) {
4771         aSig = ( (uint64_t)aSig * aSig ) >> 23;
4772         if ( aSig & 0x01000000 ) {
4773             aSig >>= 1;
4774             zSig |= i;
4775         }
4776     }
4777 
4778     if ( zSign )
4779         zSig = -zSig;
4780 
4781     return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
4782 }
4783 
4784 /*----------------------------------------------------------------------------
4785 | Returns 1 if the single-precision floating-point value `a' is equal to
4786 | the corresponding value `b', and 0 otherwise.  The invalid exception is
4787 | raised if either operand is a NaN.  Otherwise, the comparison is performed
4788 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4789 *----------------------------------------------------------------------------*/
4790 
4791 int float32_eq(float32 a, float32 b, float_status *status)
4792 {
4793     uint32_t av, bv;
4794     a = float32_squash_input_denormal(a, status);
4795     b = float32_squash_input_denormal(b, status);
4796 
4797     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4798          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4799        ) {
4800         float_raise(float_flag_invalid, status);
4801         return 0;
4802     }
4803     av = float32_val(a);
4804     bv = float32_val(b);
4805     return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
4806 }
4807 
4808 /*----------------------------------------------------------------------------
4809 | Returns 1 if the single-precision floating-point value `a' is less than
4810 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
4811 | exception is raised if either operand is a NaN.  The comparison is performed
4812 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4813 *----------------------------------------------------------------------------*/
4814 
4815 int float32_le(float32 a, float32 b, float_status *status)
4816 {
4817     flag aSign, bSign;
4818     uint32_t av, bv;
4819     a = float32_squash_input_denormal(a, status);
4820     b = float32_squash_input_denormal(b, status);
4821 
4822     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4823          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4824        ) {
4825         float_raise(float_flag_invalid, status);
4826         return 0;
4827     }
4828     aSign = extractFloat32Sign( a );
4829     bSign = extractFloat32Sign( b );
4830     av = float32_val(a);
4831     bv = float32_val(b);
4832     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
4833     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4834 
4835 }
4836 
4837 /*----------------------------------------------------------------------------
4838 | Returns 1 if the single-precision floating-point value `a' is less than
4839 | the corresponding value `b', and 0 otherwise.  The invalid exception is
4840 | raised if either operand is a NaN.  The comparison is performed according
4841 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4842 *----------------------------------------------------------------------------*/
4843 
4844 int float32_lt(float32 a, float32 b, float_status *status)
4845 {
4846     flag aSign, bSign;
4847     uint32_t av, bv;
4848     a = float32_squash_input_denormal(a, status);
4849     b = float32_squash_input_denormal(b, status);
4850 
4851     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4852          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4853        ) {
4854         float_raise(float_flag_invalid, status);
4855         return 0;
4856     }
4857     aSign = extractFloat32Sign( a );
4858     bSign = extractFloat32Sign( b );
4859     av = float32_val(a);
4860     bv = float32_val(b);
4861     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
4862     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4863 
4864 }
4865 
4866 /*----------------------------------------------------------------------------
4867 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
4868 | be compared, and 0 otherwise.  The invalid exception is raised if either
4869 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
4870 | Standard for Binary Floating-Point Arithmetic.
4871 *----------------------------------------------------------------------------*/
4872 
4873 int float32_unordered(float32 a, float32 b, float_status *status)
4874 {
4875     a = float32_squash_input_denormal(a, status);
4876     b = float32_squash_input_denormal(b, status);
4877 
4878     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4879          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4880        ) {
4881         float_raise(float_flag_invalid, status);
4882         return 1;
4883     }
4884     return 0;
4885 }
4886 
4887 /*----------------------------------------------------------------------------
4888 | Returns 1 if the single-precision floating-point value `a' is equal to
4889 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4890 | exception.  The comparison is performed according to the IEC/IEEE Standard
4891 | for Binary Floating-Point Arithmetic.
4892 *----------------------------------------------------------------------------*/
4893 
4894 int float32_eq_quiet(float32 a, float32 b, float_status *status)
4895 {
4896     a = float32_squash_input_denormal(a, status);
4897     b = float32_squash_input_denormal(b, status);
4898 
4899     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4900          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4901        ) {
4902         if (float32_is_signaling_nan(a, status)
4903          || float32_is_signaling_nan(b, status)) {
4904             float_raise(float_flag_invalid, status);
4905         }
4906         return 0;
4907     }
4908     return ( float32_val(a) == float32_val(b) ) ||
4909             ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
4910 }
4911 
4912 /*----------------------------------------------------------------------------
4913 | Returns 1 if the single-precision floating-point value `a' is less than or
4914 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4915 | cause an exception.  Otherwise, the comparison is performed according to the
4916 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4917 *----------------------------------------------------------------------------*/
4918 
4919 int float32_le_quiet(float32 a, float32 b, float_status *status)
4920 {
4921     flag aSign, bSign;
4922     uint32_t av, bv;
4923     a = float32_squash_input_denormal(a, status);
4924     b = float32_squash_input_denormal(b, status);
4925 
4926     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4927          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4928        ) {
4929         if (float32_is_signaling_nan(a, status)
4930          || float32_is_signaling_nan(b, status)) {
4931             float_raise(float_flag_invalid, status);
4932         }
4933         return 0;
4934     }
4935     aSign = extractFloat32Sign( a );
4936     bSign = extractFloat32Sign( b );
4937     av = float32_val(a);
4938     bv = float32_val(b);
4939     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
4940     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4941 
4942 }
4943 
4944 /*----------------------------------------------------------------------------
4945 | Returns 1 if the single-precision floating-point value `a' is less than
4946 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4947 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4948 | Standard for Binary Floating-Point Arithmetic.
4949 *----------------------------------------------------------------------------*/
4950 
4951 int float32_lt_quiet(float32 a, float32 b, float_status *status)
4952 {
4953     flag aSign, bSign;
4954     uint32_t av, bv;
4955     a = float32_squash_input_denormal(a, status);
4956     b = float32_squash_input_denormal(b, status);
4957 
4958     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4959          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4960        ) {
4961         if (float32_is_signaling_nan(a, status)
4962          || float32_is_signaling_nan(b, status)) {
4963             float_raise(float_flag_invalid, status);
4964         }
4965         return 0;
4966     }
4967     aSign = extractFloat32Sign( a );
4968     bSign = extractFloat32Sign( b );
4969     av = float32_val(a);
4970     bv = float32_val(b);
4971     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
4972     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4973 
4974 }
4975 
4976 /*----------------------------------------------------------------------------
4977 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
4978 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
4979 | comparison is performed according to the IEC/IEEE Standard for Binary
4980 | Floating-Point Arithmetic.
4981 *----------------------------------------------------------------------------*/
4982 
4983 int float32_unordered_quiet(float32 a, float32 b, float_status *status)
4984 {
4985     a = float32_squash_input_denormal(a, status);
4986     b = float32_squash_input_denormal(b, status);
4987 
4988     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4989          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4990        ) {
4991         if (float32_is_signaling_nan(a, status)
4992          || float32_is_signaling_nan(b, status)) {
4993             float_raise(float_flag_invalid, status);
4994         }
4995         return 1;
4996     }
4997     return 0;
4998 }
4999 
5000 /*----------------------------------------------------------------------------
5001 | Returns the result of converting the double-precision floating-point value
5002 | `a' to the extended double-precision floating-point format.  The conversion
5003 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5004 | Arithmetic.
5005 *----------------------------------------------------------------------------*/
5006 
5007 floatx80 float64_to_floatx80(float64 a, float_status *status)
5008 {
5009     flag aSign;
5010     int aExp;
5011     uint64_t aSig;
5012 
5013     a = float64_squash_input_denormal(a, status);
5014     aSig = extractFloat64Frac( a );
5015     aExp = extractFloat64Exp( a );
5016     aSign = extractFloat64Sign( a );
5017     if ( aExp == 0x7FF ) {
5018         if (aSig) {
5019             return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
5020         }
5021         return packFloatx80(aSign,
5022                             floatx80_infinity_high,
5023                             floatx80_infinity_low);
5024     }
5025     if ( aExp == 0 ) {
5026         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
5027         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5028     }
5029     return
5030         packFloatx80(
5031             aSign, aExp + 0x3C00, (aSig | UINT64_C(0x0010000000000000)) << 11);
5032 
5033 }
5034 
5035 /*----------------------------------------------------------------------------
5036 | Returns the result of converting the double-precision floating-point value
5037 | `a' to the quadruple-precision floating-point format.  The conversion is
5038 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5039 | Arithmetic.
5040 *----------------------------------------------------------------------------*/
5041 
5042 float128 float64_to_float128(float64 a, float_status *status)
5043 {
5044     flag aSign;
5045     int aExp;
5046     uint64_t aSig, zSig0, zSig1;
5047 
5048     a = float64_squash_input_denormal(a, status);
5049     aSig = extractFloat64Frac( a );
5050     aExp = extractFloat64Exp( a );
5051     aSign = extractFloat64Sign( a );
5052     if ( aExp == 0x7FF ) {
5053         if (aSig) {
5054             return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
5055         }
5056         return packFloat128( aSign, 0x7FFF, 0, 0 );
5057     }
5058     if ( aExp == 0 ) {
5059         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
5060         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5061         --aExp;
5062     }
5063     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
5064     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
5065 
5066 }
5067 
5068 
5069 /*----------------------------------------------------------------------------
5070 | Returns the remainder of the double-precision floating-point value `a'
5071 | with respect to the corresponding value `b'.  The operation is performed
5072 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5073 *----------------------------------------------------------------------------*/
5074 
5075 float64 float64_rem(float64 a, float64 b, float_status *status)
5076 {
5077     flag aSign, zSign;
5078     int aExp, bExp, expDiff;
5079     uint64_t aSig, bSig;
5080     uint64_t q, alternateASig;
5081     int64_t sigMean;
5082 
5083     a = float64_squash_input_denormal(a, status);
5084     b = float64_squash_input_denormal(b, status);
5085     aSig = extractFloat64Frac( a );
5086     aExp = extractFloat64Exp( a );
5087     aSign = extractFloat64Sign( a );
5088     bSig = extractFloat64Frac( b );
5089     bExp = extractFloat64Exp( b );
5090     if ( aExp == 0x7FF ) {
5091         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
5092             return propagateFloat64NaN(a, b, status);
5093         }
5094         float_raise(float_flag_invalid, status);
5095         return float64_default_nan(status);
5096     }
5097     if ( bExp == 0x7FF ) {
5098         if (bSig) {
5099             return propagateFloat64NaN(a, b, status);
5100         }
5101         return a;
5102     }
5103     if ( bExp == 0 ) {
5104         if ( bSig == 0 ) {
5105             float_raise(float_flag_invalid, status);
5106             return float64_default_nan(status);
5107         }
5108         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
5109     }
5110     if ( aExp == 0 ) {
5111         if ( aSig == 0 ) return a;
5112         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5113     }
5114     expDiff = aExp - bExp;
5115     aSig = (aSig | UINT64_C(0x0010000000000000)) << 11;
5116     bSig = (bSig | UINT64_C(0x0010000000000000)) << 11;
5117     if ( expDiff < 0 ) {
5118         if ( expDiff < -1 ) return a;
5119         aSig >>= 1;
5120     }
5121     q = ( bSig <= aSig );
5122     if ( q ) aSig -= bSig;
5123     expDiff -= 64;
5124     while ( 0 < expDiff ) {
5125         q = estimateDiv128To64( aSig, 0, bSig );
5126         q = ( 2 < q ) ? q - 2 : 0;
5127         aSig = - ( ( bSig>>2 ) * q );
5128         expDiff -= 62;
5129     }
5130     expDiff += 64;
5131     if ( 0 < expDiff ) {
5132         q = estimateDiv128To64( aSig, 0, bSig );
5133         q = ( 2 < q ) ? q - 2 : 0;
5134         q >>= 64 - expDiff;
5135         bSig >>= 2;
5136         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
5137     }
5138     else {
5139         aSig >>= 2;
5140         bSig >>= 2;
5141     }
5142     do {
5143         alternateASig = aSig;
5144         ++q;
5145         aSig -= bSig;
5146     } while ( 0 <= (int64_t) aSig );
5147     sigMean = aSig + alternateASig;
5148     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
5149         aSig = alternateASig;
5150     }
5151     zSign = ( (int64_t) aSig < 0 );
5152     if ( zSign ) aSig = - aSig;
5153     return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
5154 
5155 }
5156 
5157 /*----------------------------------------------------------------------------
5158 | Returns the binary log of the double-precision floating-point value `a'.
5159 | The operation is performed according to the IEC/IEEE Standard for Binary
5160 | Floating-Point Arithmetic.
5161 *----------------------------------------------------------------------------*/
5162 float64 float64_log2(float64 a, float_status *status)
5163 {
5164     flag aSign, zSign;
5165     int aExp;
5166     uint64_t aSig, aSig0, aSig1, zSig, i;
5167     a = float64_squash_input_denormal(a, status);
5168 
5169     aSig = extractFloat64Frac( a );
5170     aExp = extractFloat64Exp( a );
5171     aSign = extractFloat64Sign( a );
5172 
5173     if ( aExp == 0 ) {
5174         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
5175         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5176     }
5177     if ( aSign ) {
5178         float_raise(float_flag_invalid, status);
5179         return float64_default_nan(status);
5180     }
5181     if ( aExp == 0x7FF ) {
5182         if (aSig) {
5183             return propagateFloat64NaN(a, float64_zero, status);
5184         }
5185         return a;
5186     }
5187 
5188     aExp -= 0x3FF;
5189     aSig |= UINT64_C(0x0010000000000000);
5190     zSign = aExp < 0;
5191     zSig = (uint64_t)aExp << 52;
5192     for (i = 1LL << 51; i > 0; i >>= 1) {
5193         mul64To128( aSig, aSig, &aSig0, &aSig1 );
5194         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
5195         if ( aSig & UINT64_C(0x0020000000000000) ) {
5196             aSig >>= 1;
5197             zSig |= i;
5198         }
5199     }
5200 
5201     if ( zSign )
5202         zSig = -zSig;
5203     return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
5204 }
5205 
5206 /*----------------------------------------------------------------------------
5207 | Returns 1 if the double-precision floating-point value `a' is equal to the
5208 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
5209 | if either operand is a NaN.  Otherwise, the comparison is performed
5210 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5211 *----------------------------------------------------------------------------*/
5212 
5213 int float64_eq(float64 a, float64 b, float_status *status)
5214 {
5215     uint64_t av, bv;
5216     a = float64_squash_input_denormal(a, status);
5217     b = float64_squash_input_denormal(b, status);
5218 
5219     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5220          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5221        ) {
5222         float_raise(float_flag_invalid, status);
5223         return 0;
5224     }
5225     av = float64_val(a);
5226     bv = float64_val(b);
5227     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
5228 
5229 }
5230 
5231 /*----------------------------------------------------------------------------
5232 | Returns 1 if the double-precision floating-point value `a' is less than or
5233 | equal to the corresponding value `b', and 0 otherwise.  The invalid
5234 | exception is raised if either operand is a NaN.  The comparison is performed
5235 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5236 *----------------------------------------------------------------------------*/
5237 
5238 int float64_le(float64 a, float64 b, float_status *status)
5239 {
5240     flag aSign, bSign;
5241     uint64_t av, bv;
5242     a = float64_squash_input_denormal(a, status);
5243     b = float64_squash_input_denormal(b, status);
5244 
5245     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5246          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5247        ) {
5248         float_raise(float_flag_invalid, status);
5249         return 0;
5250     }
5251     aSign = extractFloat64Sign( a );
5252     bSign = extractFloat64Sign( b );
5253     av = float64_val(a);
5254     bv = float64_val(b);
5255     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
5256     return ( av == bv ) || ( aSign ^ ( av < bv ) );
5257 
5258 }
5259 
5260 /*----------------------------------------------------------------------------
5261 | Returns 1 if the double-precision floating-point value `a' is less than
5262 | the corresponding value `b', and 0 otherwise.  The invalid exception is
5263 | raised if either operand is a NaN.  The comparison is performed according
5264 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5265 *----------------------------------------------------------------------------*/
5266 
5267 int float64_lt(float64 a, float64 b, float_status *status)
5268 {
5269     flag aSign, bSign;
5270     uint64_t av, bv;
5271 
5272     a = float64_squash_input_denormal(a, status);
5273     b = float64_squash_input_denormal(b, status);
5274     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5275          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5276        ) {
5277         float_raise(float_flag_invalid, status);
5278         return 0;
5279     }
5280     aSign = extractFloat64Sign( a );
5281     bSign = extractFloat64Sign( b );
5282     av = float64_val(a);
5283     bv = float64_val(b);
5284     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
5285     return ( av != bv ) && ( aSign ^ ( av < bv ) );
5286 
5287 }
5288 
5289 /*----------------------------------------------------------------------------
5290 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
5291 | be compared, and 0 otherwise.  The invalid exception is raised if either
5292 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
5293 | Standard for Binary Floating-Point Arithmetic.
5294 *----------------------------------------------------------------------------*/
5295 
5296 int float64_unordered(float64 a, float64 b, float_status *status)
5297 {
5298     a = float64_squash_input_denormal(a, status);
5299     b = float64_squash_input_denormal(b, status);
5300 
5301     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5302          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5303        ) {
5304         float_raise(float_flag_invalid, status);
5305         return 1;
5306     }
5307     return 0;
5308 }
5309 
5310 /*----------------------------------------------------------------------------
5311 | Returns 1 if the double-precision floating-point value `a' is equal to the
5312 | corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5313 | exception.The comparison is performed according to the IEC/IEEE Standard
5314 | for Binary Floating-Point Arithmetic.
5315 *----------------------------------------------------------------------------*/
5316 
5317 int float64_eq_quiet(float64 a, float64 b, float_status *status)
5318 {
5319     uint64_t av, bv;
5320     a = float64_squash_input_denormal(a, status);
5321     b = float64_squash_input_denormal(b, status);
5322 
5323     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5324          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5325        ) {
5326         if (float64_is_signaling_nan(a, status)
5327          || float64_is_signaling_nan(b, status)) {
5328             float_raise(float_flag_invalid, status);
5329         }
5330         return 0;
5331     }
5332     av = float64_val(a);
5333     bv = float64_val(b);
5334     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
5335 
5336 }
5337 
5338 /*----------------------------------------------------------------------------
5339 | Returns 1 if the double-precision floating-point value `a' is less than or
5340 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5341 | cause an exception.  Otherwise, the comparison is performed according to the
5342 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5343 *----------------------------------------------------------------------------*/
5344 
5345 int float64_le_quiet(float64 a, float64 b, float_status *status)
5346 {
5347     flag aSign, bSign;
5348     uint64_t av, bv;
5349     a = float64_squash_input_denormal(a, status);
5350     b = float64_squash_input_denormal(b, status);
5351 
5352     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5353          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5354        ) {
5355         if (float64_is_signaling_nan(a, status)
5356          || float64_is_signaling_nan(b, status)) {
5357             float_raise(float_flag_invalid, status);
5358         }
5359         return 0;
5360     }
5361     aSign = extractFloat64Sign( a );
5362     bSign = extractFloat64Sign( b );
5363     av = float64_val(a);
5364     bv = float64_val(b);
5365     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
5366     return ( av == bv ) || ( aSign ^ ( av < bv ) );
5367 
5368 }
5369 
5370 /*----------------------------------------------------------------------------
5371 | Returns 1 if the double-precision floating-point value `a' is less than
5372 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5373 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
5374 | Standard for Binary Floating-Point Arithmetic.
5375 *----------------------------------------------------------------------------*/
5376 
5377 int float64_lt_quiet(float64 a, float64 b, float_status *status)
5378 {
5379     flag aSign, bSign;
5380     uint64_t av, bv;
5381     a = float64_squash_input_denormal(a, status);
5382     b = float64_squash_input_denormal(b, status);
5383 
5384     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5385          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5386        ) {
5387         if (float64_is_signaling_nan(a, status)
5388          || float64_is_signaling_nan(b, status)) {
5389             float_raise(float_flag_invalid, status);
5390         }
5391         return 0;
5392     }
5393     aSign = extractFloat64Sign( a );
5394     bSign = extractFloat64Sign( b );
5395     av = float64_val(a);
5396     bv = float64_val(b);
5397     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
5398     return ( av != bv ) && ( aSign ^ ( av < bv ) );
5399 
5400 }
5401 
5402 /*----------------------------------------------------------------------------
5403 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
5404 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
5405 | comparison is performed according to the IEC/IEEE Standard for Binary
5406 | Floating-Point Arithmetic.
5407 *----------------------------------------------------------------------------*/
5408 
5409 int float64_unordered_quiet(float64 a, float64 b, float_status *status)
5410 {
5411     a = float64_squash_input_denormal(a, status);
5412     b = float64_squash_input_denormal(b, status);
5413 
5414     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5415          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5416        ) {
5417         if (float64_is_signaling_nan(a, status)
5418          || float64_is_signaling_nan(b, status)) {
5419             float_raise(float_flag_invalid, status);
5420         }
5421         return 1;
5422     }
5423     return 0;
5424 }
5425 
5426 /*----------------------------------------------------------------------------
5427 | Returns the result of converting the extended double-precision floating-
5428 | point value `a' to the 32-bit two's complement integer format.  The
5429 | conversion is performed according to the IEC/IEEE Standard for Binary
5430 | Floating-Point Arithmetic---which means in particular that the conversion
5431 | is rounded according to the current rounding mode.  If `a' is a NaN, the
5432 | largest positive integer is returned.  Otherwise, if the conversion
5433 | overflows, the largest integer with the same sign as `a' is returned.
5434 *----------------------------------------------------------------------------*/
5435 
5436 int32_t floatx80_to_int32(floatx80 a, float_status *status)
5437 {
5438     flag aSign;
5439     int32_t aExp, shiftCount;
5440     uint64_t aSig;
5441 
5442     if (floatx80_invalid_encoding(a)) {
5443         float_raise(float_flag_invalid, status);
5444         return 1 << 31;
5445     }
5446     aSig = extractFloatx80Frac( a );
5447     aExp = extractFloatx80Exp( a );
5448     aSign = extractFloatx80Sign( a );
5449     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
5450     shiftCount = 0x4037 - aExp;
5451     if ( shiftCount <= 0 ) shiftCount = 1;
5452     shift64RightJamming( aSig, shiftCount, &aSig );
5453     return roundAndPackInt32(aSign, aSig, status);
5454 
5455 }
5456 
5457 /*----------------------------------------------------------------------------
5458 | Returns the result of converting the extended double-precision floating-
5459 | point value `a' to the 32-bit two's complement integer format.  The
5460 | conversion is performed according to the IEC/IEEE Standard for Binary
5461 | Floating-Point Arithmetic, except that the conversion is always rounded
5462 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
5463 | Otherwise, if the conversion overflows, the largest integer with the same
5464 | sign as `a' is returned.
5465 *----------------------------------------------------------------------------*/
5466 
5467 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
5468 {
5469     flag aSign;
5470     int32_t aExp, shiftCount;
5471     uint64_t aSig, savedASig;
5472     int32_t z;
5473 
5474     if (floatx80_invalid_encoding(a)) {
5475         float_raise(float_flag_invalid, status);
5476         return 1 << 31;
5477     }
5478     aSig = extractFloatx80Frac( a );
5479     aExp = extractFloatx80Exp( a );
5480     aSign = extractFloatx80Sign( a );
5481     if ( 0x401E < aExp ) {
5482         if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
5483         goto invalid;
5484     }
5485     else if ( aExp < 0x3FFF ) {
5486         if (aExp || aSig) {
5487             status->float_exception_flags |= float_flag_inexact;
5488         }
5489         return 0;
5490     }
5491     shiftCount = 0x403E - aExp;
5492     savedASig = aSig;
5493     aSig >>= shiftCount;
5494     z = aSig;
5495     if ( aSign ) z = - z;
5496     if ( ( z < 0 ) ^ aSign ) {
5497  invalid:
5498         float_raise(float_flag_invalid, status);
5499         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5500     }
5501     if ( ( aSig<<shiftCount ) != savedASig ) {
5502         status->float_exception_flags |= float_flag_inexact;
5503     }
5504     return z;
5505 
5506 }
5507 
5508 /*----------------------------------------------------------------------------
5509 | Returns the result of converting the extended double-precision floating-
5510 | point value `a' to the 64-bit two's complement integer format.  The
5511 | conversion is performed according to the IEC/IEEE Standard for Binary
5512 | Floating-Point Arithmetic---which means in particular that the conversion
5513 | is rounded according to the current rounding mode.  If `a' is a NaN,
5514 | the largest positive integer is returned.  Otherwise, if the conversion
5515 | overflows, the largest integer with the same sign as `a' is returned.
5516 *----------------------------------------------------------------------------*/
5517 
5518 int64_t floatx80_to_int64(floatx80 a, float_status *status)
5519 {
5520     flag aSign;
5521     int32_t aExp, shiftCount;
5522     uint64_t aSig, aSigExtra;
5523 
5524     if (floatx80_invalid_encoding(a)) {
5525         float_raise(float_flag_invalid, status);
5526         return 1ULL << 63;
5527     }
5528     aSig = extractFloatx80Frac( a );
5529     aExp = extractFloatx80Exp( a );
5530     aSign = extractFloatx80Sign( a );
5531     shiftCount = 0x403E - aExp;
5532     if ( shiftCount <= 0 ) {
5533         if ( shiftCount ) {
5534             float_raise(float_flag_invalid, status);
5535             if (!aSign || floatx80_is_any_nan(a)) {
5536                 return INT64_MAX;
5537             }
5538             return INT64_MIN;
5539         }
5540         aSigExtra = 0;
5541     }
5542     else {
5543         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
5544     }
5545     return roundAndPackInt64(aSign, aSig, aSigExtra, status);
5546 
5547 }
5548 
5549 /*----------------------------------------------------------------------------
5550 | Returns the result of converting the extended double-precision floating-
5551 | point value `a' to the 64-bit two's complement integer format.  The
5552 | conversion is performed according to the IEC/IEEE Standard for Binary
5553 | Floating-Point Arithmetic, except that the conversion is always rounded
5554 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
5555 | Otherwise, if the conversion overflows, the largest integer with the same
5556 | sign as `a' is returned.
5557 *----------------------------------------------------------------------------*/
5558 
5559 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
5560 {
5561     flag aSign;
5562     int32_t aExp, shiftCount;
5563     uint64_t aSig;
5564     int64_t z;
5565 
5566     if (floatx80_invalid_encoding(a)) {
5567         float_raise(float_flag_invalid, status);
5568         return 1ULL << 63;
5569     }
5570     aSig = extractFloatx80Frac( a );
5571     aExp = extractFloatx80Exp( a );
5572     aSign = extractFloatx80Sign( a );
5573     shiftCount = aExp - 0x403E;
5574     if ( 0 <= shiftCount ) {
5575         aSig &= UINT64_C(0x7FFFFFFFFFFFFFFF);
5576         if ( ( a.high != 0xC03E ) || aSig ) {
5577             float_raise(float_flag_invalid, status);
5578             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
5579                 return INT64_MAX;
5580             }
5581         }
5582         return INT64_MIN;
5583     }
5584     else if ( aExp < 0x3FFF ) {
5585         if (aExp | aSig) {
5586             status->float_exception_flags |= float_flag_inexact;
5587         }
5588         return 0;
5589     }
5590     z = aSig>>( - shiftCount );
5591     if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
5592         status->float_exception_flags |= float_flag_inexact;
5593     }
5594     if ( aSign ) z = - z;
5595     return z;
5596 
5597 }
5598 
5599 /*----------------------------------------------------------------------------
5600 | Returns the result of converting the extended double-precision floating-
5601 | point value `a' to the single-precision floating-point format.  The
5602 | conversion is performed according to the IEC/IEEE Standard for Binary
5603 | Floating-Point Arithmetic.
5604 *----------------------------------------------------------------------------*/
5605 
5606 float32 floatx80_to_float32(floatx80 a, float_status *status)
5607 {
5608     flag aSign;
5609     int32_t aExp;
5610     uint64_t aSig;
5611 
5612     if (floatx80_invalid_encoding(a)) {
5613         float_raise(float_flag_invalid, status);
5614         return float32_default_nan(status);
5615     }
5616     aSig = extractFloatx80Frac( a );
5617     aExp = extractFloatx80Exp( a );
5618     aSign = extractFloatx80Sign( a );
5619     if ( aExp == 0x7FFF ) {
5620         if ( (uint64_t) ( aSig<<1 ) ) {
5621             return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
5622         }
5623         return packFloat32( aSign, 0xFF, 0 );
5624     }
5625     shift64RightJamming( aSig, 33, &aSig );
5626     if ( aExp || aSig ) aExp -= 0x3F81;
5627     return roundAndPackFloat32(aSign, aExp, aSig, status);
5628 
5629 }
5630 
5631 /*----------------------------------------------------------------------------
5632 | Returns the result of converting the extended double-precision floating-
5633 | point value `a' to the double-precision floating-point format.  The
5634 | conversion is performed according to the IEC/IEEE Standard for Binary
5635 | Floating-Point Arithmetic.
5636 *----------------------------------------------------------------------------*/
5637 
5638 float64 floatx80_to_float64(floatx80 a, float_status *status)
5639 {
5640     flag aSign;
5641     int32_t aExp;
5642     uint64_t aSig, zSig;
5643 
5644     if (floatx80_invalid_encoding(a)) {
5645         float_raise(float_flag_invalid, status);
5646         return float64_default_nan(status);
5647     }
5648     aSig = extractFloatx80Frac( a );
5649     aExp = extractFloatx80Exp( a );
5650     aSign = extractFloatx80Sign( a );
5651     if ( aExp == 0x7FFF ) {
5652         if ( (uint64_t) ( aSig<<1 ) ) {
5653             return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
5654         }
5655         return packFloat64( aSign, 0x7FF, 0 );
5656     }
5657     shift64RightJamming( aSig, 1, &zSig );
5658     if ( aExp || aSig ) aExp -= 0x3C01;
5659     return roundAndPackFloat64(aSign, aExp, zSig, status);
5660 
5661 }
5662 
5663 /*----------------------------------------------------------------------------
5664 | Returns the result of converting the extended double-precision floating-
5665 | point value `a' to the quadruple-precision floating-point format.  The
5666 | conversion is performed according to the IEC/IEEE Standard for Binary
5667 | Floating-Point Arithmetic.
5668 *----------------------------------------------------------------------------*/
5669 
5670 float128 floatx80_to_float128(floatx80 a, float_status *status)
5671 {
5672     flag aSign;
5673     int aExp;
5674     uint64_t aSig, zSig0, zSig1;
5675 
5676     if (floatx80_invalid_encoding(a)) {
5677         float_raise(float_flag_invalid, status);
5678         return float128_default_nan(status);
5679     }
5680     aSig = extractFloatx80Frac( a );
5681     aExp = extractFloatx80Exp( a );
5682     aSign = extractFloatx80Sign( a );
5683     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
5684         return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
5685     }
5686     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
5687     return packFloat128( aSign, aExp, zSig0, zSig1 );
5688 
5689 }
5690 
5691 /*----------------------------------------------------------------------------
5692 | Rounds the extended double-precision floating-point value `a'
5693 | to the precision provided by floatx80_rounding_precision and returns the
5694 | result as an extended double-precision floating-point value.
5695 | The operation is performed according to the IEC/IEEE Standard for Binary
5696 | Floating-Point Arithmetic.
5697 *----------------------------------------------------------------------------*/
5698 
5699 floatx80 floatx80_round(floatx80 a, float_status *status)
5700 {
5701     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5702                                 extractFloatx80Sign(a),
5703                                 extractFloatx80Exp(a),
5704                                 extractFloatx80Frac(a), 0, status);
5705 }
5706 
5707 /*----------------------------------------------------------------------------
5708 | Rounds the extended double-precision floating-point value `a' to an integer,
5709 | and returns the result as an extended quadruple-precision floating-point
5710 | value.  The operation is performed according to the IEC/IEEE Standard for
5711 | Binary Floating-Point Arithmetic.
5712 *----------------------------------------------------------------------------*/
5713 
5714 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
5715 {
5716     flag aSign;
5717     int32_t aExp;
5718     uint64_t lastBitMask, roundBitsMask;
5719     floatx80 z;
5720 
5721     if (floatx80_invalid_encoding(a)) {
5722         float_raise(float_flag_invalid, status);
5723         return floatx80_default_nan(status);
5724     }
5725     aExp = extractFloatx80Exp( a );
5726     if ( 0x403E <= aExp ) {
5727         if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
5728             return propagateFloatx80NaN(a, a, status);
5729         }
5730         return a;
5731     }
5732     if ( aExp < 0x3FFF ) {
5733         if (    ( aExp == 0 )
5734              && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
5735             return a;
5736         }
5737         status->float_exception_flags |= float_flag_inexact;
5738         aSign = extractFloatx80Sign( a );
5739         switch (status->float_rounding_mode) {
5740          case float_round_nearest_even:
5741             if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
5742                ) {
5743                 return
5744                     packFloatx80( aSign, 0x3FFF, UINT64_C(0x8000000000000000));
5745             }
5746             break;
5747         case float_round_ties_away:
5748             if (aExp == 0x3FFE) {
5749                 return packFloatx80(aSign, 0x3FFF, UINT64_C(0x8000000000000000));
5750             }
5751             break;
5752          case float_round_down:
5753             return
5754                   aSign ?
5755                       packFloatx80( 1, 0x3FFF, UINT64_C(0x8000000000000000))
5756                 : packFloatx80( 0, 0, 0 );
5757          case float_round_up:
5758             return
5759                   aSign ? packFloatx80( 1, 0, 0 )
5760                 : packFloatx80( 0, 0x3FFF, UINT64_C(0x8000000000000000));
5761         }
5762         return packFloatx80( aSign, 0, 0 );
5763     }
5764     lastBitMask = 1;
5765     lastBitMask <<= 0x403E - aExp;
5766     roundBitsMask = lastBitMask - 1;
5767     z = a;
5768     switch (status->float_rounding_mode) {
5769     case float_round_nearest_even:
5770         z.low += lastBitMask>>1;
5771         if ((z.low & roundBitsMask) == 0) {
5772             z.low &= ~lastBitMask;
5773         }
5774         break;
5775     case float_round_ties_away:
5776         z.low += lastBitMask >> 1;
5777         break;
5778     case float_round_to_zero:
5779         break;
5780     case float_round_up:
5781         if (!extractFloatx80Sign(z)) {
5782             z.low += roundBitsMask;
5783         }
5784         break;
5785     case float_round_down:
5786         if (extractFloatx80Sign(z)) {
5787             z.low += roundBitsMask;
5788         }
5789         break;
5790     default:
5791         abort();
5792     }
5793     z.low &= ~ roundBitsMask;
5794     if ( z.low == 0 ) {
5795         ++z.high;
5796         z.low = UINT64_C(0x8000000000000000);
5797     }
5798     if (z.low != a.low) {
5799         status->float_exception_flags |= float_flag_inexact;
5800     }
5801     return z;
5802 
5803 }
5804 
5805 /*----------------------------------------------------------------------------
5806 | Returns the result of adding the absolute values of the extended double-
5807 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
5808 | negated before being returned.  `zSign' is ignored if the result is a NaN.
5809 | The addition is performed according to the IEC/IEEE Standard for Binary
5810 | Floating-Point Arithmetic.
5811 *----------------------------------------------------------------------------*/
5812 
5813 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5814                                 float_status *status)
5815 {
5816     int32_t aExp, bExp, zExp;
5817     uint64_t aSig, bSig, zSig0, zSig1;
5818     int32_t expDiff;
5819 
5820     aSig = extractFloatx80Frac( a );
5821     aExp = extractFloatx80Exp( a );
5822     bSig = extractFloatx80Frac( b );
5823     bExp = extractFloatx80Exp( b );
5824     expDiff = aExp - bExp;
5825     if ( 0 < expDiff ) {
5826         if ( aExp == 0x7FFF ) {
5827             if ((uint64_t)(aSig << 1)) {
5828                 return propagateFloatx80NaN(a, b, status);
5829             }
5830             return a;
5831         }
5832         if ( bExp == 0 ) --expDiff;
5833         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5834         zExp = aExp;
5835     }
5836     else if ( expDiff < 0 ) {
5837         if ( bExp == 0x7FFF ) {
5838             if ((uint64_t)(bSig << 1)) {
5839                 return propagateFloatx80NaN(a, b, status);
5840             }
5841             return packFloatx80(zSign,
5842                                 floatx80_infinity_high,
5843                                 floatx80_infinity_low);
5844         }
5845         if ( aExp == 0 ) ++expDiff;
5846         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5847         zExp = bExp;
5848     }
5849     else {
5850         if ( aExp == 0x7FFF ) {
5851             if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5852                 return propagateFloatx80NaN(a, b, status);
5853             }
5854             return a;
5855         }
5856         zSig1 = 0;
5857         zSig0 = aSig + bSig;
5858         if ( aExp == 0 ) {
5859             if (zSig0 == 0) {
5860                 return packFloatx80(zSign, 0, 0);
5861             }
5862             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
5863             goto roundAndPack;
5864         }
5865         zExp = aExp;
5866         goto shiftRight1;
5867     }
5868     zSig0 = aSig + bSig;
5869     if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
5870  shiftRight1:
5871     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
5872     zSig0 |= UINT64_C(0x8000000000000000);
5873     ++zExp;
5874  roundAndPack:
5875     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5876                                 zSign, zExp, zSig0, zSig1, status);
5877 }
5878 
5879 /*----------------------------------------------------------------------------
5880 | Returns the result of subtracting the absolute values of the extended
5881 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
5882 | difference is negated before being returned.  `zSign' is ignored if the
5883 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
5884 | Standard for Binary Floating-Point Arithmetic.
5885 *----------------------------------------------------------------------------*/
5886 
5887 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5888                                 float_status *status)
5889 {
5890     int32_t aExp, bExp, zExp;
5891     uint64_t aSig, bSig, zSig0, zSig1;
5892     int32_t expDiff;
5893 
5894     aSig = extractFloatx80Frac( a );
5895     aExp = extractFloatx80Exp( a );
5896     bSig = extractFloatx80Frac( b );
5897     bExp = extractFloatx80Exp( b );
5898     expDiff = aExp - bExp;
5899     if ( 0 < expDiff ) goto aExpBigger;
5900     if ( expDiff < 0 ) goto bExpBigger;
5901     if ( aExp == 0x7FFF ) {
5902         if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5903             return propagateFloatx80NaN(a, b, status);
5904         }
5905         float_raise(float_flag_invalid, status);
5906         return floatx80_default_nan(status);
5907     }
5908     if ( aExp == 0 ) {
5909         aExp = 1;
5910         bExp = 1;
5911     }
5912     zSig1 = 0;
5913     if ( bSig < aSig ) goto aBigger;
5914     if ( aSig < bSig ) goto bBigger;
5915     return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
5916  bExpBigger:
5917     if ( bExp == 0x7FFF ) {
5918         if ((uint64_t)(bSig << 1)) {
5919             return propagateFloatx80NaN(a, b, status);
5920         }
5921         return packFloatx80(zSign ^ 1, floatx80_infinity_high,
5922                             floatx80_infinity_low);
5923     }
5924     if ( aExp == 0 ) ++expDiff;
5925     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5926  bBigger:
5927     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
5928     zExp = bExp;
5929     zSign ^= 1;
5930     goto normalizeRoundAndPack;
5931  aExpBigger:
5932     if ( aExp == 0x7FFF ) {
5933         if ((uint64_t)(aSig << 1)) {
5934             return propagateFloatx80NaN(a, b, status);
5935         }
5936         return a;
5937     }
5938     if ( bExp == 0 ) --expDiff;
5939     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5940  aBigger:
5941     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
5942     zExp = aExp;
5943  normalizeRoundAndPack:
5944     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
5945                                          zSign, zExp, zSig0, zSig1, status);
5946 }
5947 
5948 /*----------------------------------------------------------------------------
5949 | Returns the result of adding the extended double-precision floating-point
5950 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
5951 | Standard for Binary Floating-Point Arithmetic.
5952 *----------------------------------------------------------------------------*/
5953 
5954 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
5955 {
5956     flag aSign, bSign;
5957 
5958     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5959         float_raise(float_flag_invalid, status);
5960         return floatx80_default_nan(status);
5961     }
5962     aSign = extractFloatx80Sign( a );
5963     bSign = extractFloatx80Sign( b );
5964     if ( aSign == bSign ) {
5965         return addFloatx80Sigs(a, b, aSign, status);
5966     }
5967     else {
5968         return subFloatx80Sigs(a, b, aSign, status);
5969     }
5970 
5971 }
5972 
5973 /*----------------------------------------------------------------------------
5974 | Returns the result of subtracting the extended double-precision floating-
5975 | point values `a' and `b'.  The operation is performed according to the
5976 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5977 *----------------------------------------------------------------------------*/
5978 
5979 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
5980 {
5981     flag aSign, bSign;
5982 
5983     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5984         float_raise(float_flag_invalid, status);
5985         return floatx80_default_nan(status);
5986     }
5987     aSign = extractFloatx80Sign( a );
5988     bSign = extractFloatx80Sign( b );
5989     if ( aSign == bSign ) {
5990         return subFloatx80Sigs(a, b, aSign, status);
5991     }
5992     else {
5993         return addFloatx80Sigs(a, b, aSign, status);
5994     }
5995 
5996 }
5997 
5998 /*----------------------------------------------------------------------------
5999 | Returns the result of multiplying the extended double-precision floating-
6000 | point values `a' and `b'.  The operation is performed according to the
6001 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6002 *----------------------------------------------------------------------------*/
6003 
6004 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
6005 {
6006     flag aSign, bSign, zSign;
6007     int32_t aExp, bExp, zExp;
6008     uint64_t aSig, bSig, zSig0, zSig1;
6009 
6010     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6011         float_raise(float_flag_invalid, status);
6012         return floatx80_default_nan(status);
6013     }
6014     aSig = extractFloatx80Frac( a );
6015     aExp = extractFloatx80Exp( a );
6016     aSign = extractFloatx80Sign( a );
6017     bSig = extractFloatx80Frac( b );
6018     bExp = extractFloatx80Exp( b );
6019     bSign = extractFloatx80Sign( b );
6020     zSign = aSign ^ bSign;
6021     if ( aExp == 0x7FFF ) {
6022         if (    (uint64_t) ( aSig<<1 )
6023              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
6024             return propagateFloatx80NaN(a, b, status);
6025         }
6026         if ( ( bExp | bSig ) == 0 ) goto invalid;
6027         return packFloatx80(zSign, floatx80_infinity_high,
6028                                    floatx80_infinity_low);
6029     }
6030     if ( bExp == 0x7FFF ) {
6031         if ((uint64_t)(bSig << 1)) {
6032             return propagateFloatx80NaN(a, b, status);
6033         }
6034         if ( ( aExp | aSig ) == 0 ) {
6035  invalid:
6036             float_raise(float_flag_invalid, status);
6037             return floatx80_default_nan(status);
6038         }
6039         return packFloatx80(zSign, floatx80_infinity_high,
6040                                    floatx80_infinity_low);
6041     }
6042     if ( aExp == 0 ) {
6043         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
6044         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
6045     }
6046     if ( bExp == 0 ) {
6047         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
6048         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
6049     }
6050     zExp = aExp + bExp - 0x3FFE;
6051     mul64To128( aSig, bSig, &zSig0, &zSig1 );
6052     if ( 0 < (int64_t) zSig0 ) {
6053         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
6054         --zExp;
6055     }
6056     return roundAndPackFloatx80(status->floatx80_rounding_precision,
6057                                 zSign, zExp, zSig0, zSig1, status);
6058 }
6059 
6060 /*----------------------------------------------------------------------------
6061 | Returns the result of dividing the extended double-precision floating-point
6062 | value `a' by the corresponding value `b'.  The operation is performed
6063 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6064 *----------------------------------------------------------------------------*/
6065 
6066 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
6067 {
6068     flag aSign, bSign, zSign;
6069     int32_t aExp, bExp, zExp;
6070     uint64_t aSig, bSig, zSig0, zSig1;
6071     uint64_t rem0, rem1, rem2, term0, term1, term2;
6072 
6073     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6074         float_raise(float_flag_invalid, status);
6075         return floatx80_default_nan(status);
6076     }
6077     aSig = extractFloatx80Frac( a );
6078     aExp = extractFloatx80Exp( a );
6079     aSign = extractFloatx80Sign( a );
6080     bSig = extractFloatx80Frac( b );
6081     bExp = extractFloatx80Exp( b );
6082     bSign = extractFloatx80Sign( b );
6083     zSign = aSign ^ bSign;
6084     if ( aExp == 0x7FFF ) {
6085         if ((uint64_t)(aSig << 1)) {
6086             return propagateFloatx80NaN(a, b, status);
6087         }
6088         if ( bExp == 0x7FFF ) {
6089             if ((uint64_t)(bSig << 1)) {
6090                 return propagateFloatx80NaN(a, b, status);
6091             }
6092             goto invalid;
6093         }
6094         return packFloatx80(zSign, floatx80_infinity_high,
6095                                    floatx80_infinity_low);
6096     }
6097     if ( bExp == 0x7FFF ) {
6098         if ((uint64_t)(bSig << 1)) {
6099             return propagateFloatx80NaN(a, b, status);
6100         }
6101         return packFloatx80( zSign, 0, 0 );
6102     }
6103     if ( bExp == 0 ) {
6104         if ( bSig == 0 ) {
6105             if ( ( aExp | aSig ) == 0 ) {
6106  invalid:
6107                 float_raise(float_flag_invalid, status);
6108                 return floatx80_default_nan(status);
6109             }
6110             float_raise(float_flag_divbyzero, status);
6111             return packFloatx80(zSign, floatx80_infinity_high,
6112                                        floatx80_infinity_low);
6113         }
6114         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
6115     }
6116     if ( aExp == 0 ) {
6117         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
6118         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
6119     }
6120     zExp = aExp - bExp + 0x3FFE;
6121     rem1 = 0;
6122     if ( bSig <= aSig ) {
6123         shift128Right( aSig, 0, 1, &aSig, &rem1 );
6124         ++zExp;
6125     }
6126     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
6127     mul64To128( bSig, zSig0, &term0, &term1 );
6128     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
6129     while ( (int64_t) rem0 < 0 ) {
6130         --zSig0;
6131         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
6132     }
6133     zSig1 = estimateDiv128To64( rem1, 0, bSig );
6134     if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
6135         mul64To128( bSig, zSig1, &term1, &term2 );
6136         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6137         while ( (int64_t) rem1 < 0 ) {
6138             --zSig1;
6139             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
6140         }
6141         zSig1 |= ( ( rem1 | rem2 ) != 0 );
6142     }
6143     return roundAndPackFloatx80(status->floatx80_rounding_precision,
6144                                 zSign, zExp, zSig0, zSig1, status);
6145 }
6146 
6147 /*----------------------------------------------------------------------------
6148 | Returns the remainder of the extended double-precision floating-point value
6149 | `a' with respect to the corresponding value `b'.  The operation is performed
6150 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6151 *----------------------------------------------------------------------------*/
6152 
6153 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
6154 {
6155     flag aSign, zSign;
6156     int32_t aExp, bExp, expDiff;
6157     uint64_t aSig0, aSig1, bSig;
6158     uint64_t q, term0, term1, alternateASig0, alternateASig1;
6159 
6160     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6161         float_raise(float_flag_invalid, status);
6162         return floatx80_default_nan(status);
6163     }
6164     aSig0 = extractFloatx80Frac( a );
6165     aExp = extractFloatx80Exp( a );
6166     aSign = extractFloatx80Sign( a );
6167     bSig = extractFloatx80Frac( b );
6168     bExp = extractFloatx80Exp( b );
6169     if ( aExp == 0x7FFF ) {
6170         if (    (uint64_t) ( aSig0<<1 )
6171              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
6172             return propagateFloatx80NaN(a, b, status);
6173         }
6174         goto invalid;
6175     }
6176     if ( bExp == 0x7FFF ) {
6177         if ((uint64_t)(bSig << 1)) {
6178             return propagateFloatx80NaN(a, b, status);
6179         }
6180         return a;
6181     }
6182     if ( bExp == 0 ) {
6183         if ( bSig == 0 ) {
6184  invalid:
6185             float_raise(float_flag_invalid, status);
6186             return floatx80_default_nan(status);
6187         }
6188         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
6189     }
6190     if ( aExp == 0 ) {
6191         if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
6192         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
6193     }
6194     bSig |= UINT64_C(0x8000000000000000);
6195     zSign = aSign;
6196     expDiff = aExp - bExp;
6197     aSig1 = 0;
6198     if ( expDiff < 0 ) {
6199         if ( expDiff < -1 ) return a;
6200         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
6201         expDiff = 0;
6202     }
6203     q = ( bSig <= aSig0 );
6204     if ( q ) aSig0 -= bSig;
6205     expDiff -= 64;
6206     while ( 0 < expDiff ) {
6207         q = estimateDiv128To64( aSig0, aSig1, bSig );
6208         q = ( 2 < q ) ? q - 2 : 0;
6209         mul64To128( bSig, q, &term0, &term1 );
6210         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6211         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
6212         expDiff -= 62;
6213     }
6214     expDiff += 64;
6215     if ( 0 < expDiff ) {
6216         q = estimateDiv128To64( aSig0, aSig1, bSig );
6217         q = ( 2 < q ) ? q - 2 : 0;
6218         q >>= 64 - expDiff;
6219         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
6220         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6221         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
6222         while ( le128( term0, term1, aSig0, aSig1 ) ) {
6223             ++q;
6224             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6225         }
6226     }
6227     else {
6228         term1 = 0;
6229         term0 = bSig;
6230     }
6231     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
6232     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
6233          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
6234               && ( q & 1 ) )
6235        ) {
6236         aSig0 = alternateASig0;
6237         aSig1 = alternateASig1;
6238         zSign = ! zSign;
6239     }
6240     return
6241         normalizeRoundAndPackFloatx80(
6242             80, zSign, bExp + expDiff, aSig0, aSig1, status);
6243 
6244 }
6245 
6246 /*----------------------------------------------------------------------------
6247 | Returns the square root of the extended double-precision floating-point
6248 | value `a'.  The operation is performed according to the IEC/IEEE Standard
6249 | for Binary Floating-Point Arithmetic.
6250 *----------------------------------------------------------------------------*/
6251 
6252 floatx80 floatx80_sqrt(floatx80 a, float_status *status)
6253 {
6254     flag aSign;
6255     int32_t aExp, zExp;
6256     uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
6257     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6258 
6259     if (floatx80_invalid_encoding(a)) {
6260         float_raise(float_flag_invalid, status);
6261         return floatx80_default_nan(status);
6262     }
6263     aSig0 = extractFloatx80Frac( a );
6264     aExp = extractFloatx80Exp( a );
6265     aSign = extractFloatx80Sign( a );
6266     if ( aExp == 0x7FFF ) {
6267         if ((uint64_t)(aSig0 << 1)) {
6268             return propagateFloatx80NaN(a, a, status);
6269         }
6270         if ( ! aSign ) return a;
6271         goto invalid;
6272     }
6273     if ( aSign ) {
6274         if ( ( aExp | aSig0 ) == 0 ) return a;
6275  invalid:
6276         float_raise(float_flag_invalid, status);
6277         return floatx80_default_nan(status);
6278     }
6279     if ( aExp == 0 ) {
6280         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
6281         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
6282     }
6283     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
6284     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
6285     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
6286     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6287     doubleZSig0 = zSig0<<1;
6288     mul64To128( zSig0, zSig0, &term0, &term1 );
6289     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6290     while ( (int64_t) rem0 < 0 ) {
6291         --zSig0;
6292         doubleZSig0 -= 2;
6293         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6294     }
6295     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6296     if ( ( zSig1 & UINT64_C(0x3FFFFFFFFFFFFFFF) ) <= 5 ) {
6297         if ( zSig1 == 0 ) zSig1 = 1;
6298         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6299         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6300         mul64To128( zSig1, zSig1, &term2, &term3 );
6301         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6302         while ( (int64_t) rem1 < 0 ) {
6303             --zSig1;
6304             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6305             term3 |= 1;
6306             term2 |= doubleZSig0;
6307             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6308         }
6309         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6310     }
6311     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
6312     zSig0 |= doubleZSig0;
6313     return roundAndPackFloatx80(status->floatx80_rounding_precision,
6314                                 0, zExp, zSig0, zSig1, status);
6315 }
6316 
6317 /*----------------------------------------------------------------------------
6318 | Returns 1 if the extended double-precision floating-point value `a' is equal
6319 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
6320 | raised if either operand is a NaN.  Otherwise, the comparison is performed
6321 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6322 *----------------------------------------------------------------------------*/
6323 
6324 int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
6325 {
6326 
6327     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
6328         || (extractFloatx80Exp(a) == 0x7FFF
6329             && (uint64_t) (extractFloatx80Frac(a) << 1))
6330         || (extractFloatx80Exp(b) == 0x7FFF
6331             && (uint64_t) (extractFloatx80Frac(b) << 1))
6332        ) {
6333         float_raise(float_flag_invalid, status);
6334         return 0;
6335     }
6336     return
6337            ( a.low == b.low )
6338         && (    ( a.high == b.high )
6339              || (    ( a.low == 0 )
6340                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6341            );
6342 
6343 }
6344 
6345 /*----------------------------------------------------------------------------
6346 | Returns 1 if the extended double-precision floating-point value `a' is
6347 | less than or equal to the corresponding value `b', and 0 otherwise.  The
6348 | invalid exception is raised if either operand is a NaN.  The comparison is
6349 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6350 | Arithmetic.
6351 *----------------------------------------------------------------------------*/
6352 
6353 int floatx80_le(floatx80 a, floatx80 b, float_status *status)
6354 {
6355     flag aSign, bSign;
6356 
6357     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
6358         || (extractFloatx80Exp(a) == 0x7FFF
6359             && (uint64_t) (extractFloatx80Frac(a) << 1))
6360         || (extractFloatx80Exp(b) == 0x7FFF
6361             && (uint64_t) (extractFloatx80Frac(b) << 1))
6362        ) {
6363         float_raise(float_flag_invalid, status);
6364         return 0;
6365     }
6366     aSign = extractFloatx80Sign( a );
6367     bSign = extractFloatx80Sign( b );
6368     if ( aSign != bSign ) {
6369         return
6370                aSign
6371             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6372                  == 0 );
6373     }
6374     return
6375           aSign ? le128( b.high, b.low, a.high, a.low )
6376         : le128( a.high, a.low, b.high, b.low );
6377 
6378 }
6379 
6380 /*----------------------------------------------------------------------------
6381 | Returns 1 if the extended double-precision floating-point value `a' is
6382 | less than the corresponding value `b', and 0 otherwise.  The invalid
6383 | exception is raised if either operand is a NaN.  The comparison is performed
6384 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6385 *----------------------------------------------------------------------------*/
6386 
6387 int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
6388 {
6389     flag aSign, bSign;
6390 
6391     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
6392         || (extractFloatx80Exp(a) == 0x7FFF
6393             && (uint64_t) (extractFloatx80Frac(a) << 1))
6394         || (extractFloatx80Exp(b) == 0x7FFF
6395             && (uint64_t) (extractFloatx80Frac(b) << 1))
6396        ) {
6397         float_raise(float_flag_invalid, status);
6398         return 0;
6399     }
6400     aSign = extractFloatx80Sign( a );
6401     bSign = extractFloatx80Sign( b );
6402     if ( aSign != bSign ) {
6403         return
6404                aSign
6405             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6406                  != 0 );
6407     }
6408     return
6409           aSign ? lt128( b.high, b.low, a.high, a.low )
6410         : lt128( a.high, a.low, b.high, b.low );
6411 
6412 }
6413 
6414 /*----------------------------------------------------------------------------
6415 | Returns 1 if the extended double-precision floating-point values `a' and `b'
6416 | cannot be compared, and 0 otherwise.  The invalid exception is raised if
6417 | either operand is a NaN.   The comparison is performed according to the
6418 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6419 *----------------------------------------------------------------------------*/
6420 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
6421 {
6422     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
6423         || (extractFloatx80Exp(a) == 0x7FFF
6424             && (uint64_t) (extractFloatx80Frac(a) << 1))
6425         || (extractFloatx80Exp(b) == 0x7FFF
6426             && (uint64_t) (extractFloatx80Frac(b) << 1))
6427        ) {
6428         float_raise(float_flag_invalid, status);
6429         return 1;
6430     }
6431     return 0;
6432 }
6433 
6434 /*----------------------------------------------------------------------------
6435 | Returns 1 if the extended double-precision floating-point value `a' is
6436 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
6437 | cause an exception.  The comparison is performed according to the IEC/IEEE
6438 | Standard for Binary Floating-Point Arithmetic.
6439 *----------------------------------------------------------------------------*/
6440 
6441 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
6442 {
6443 
6444     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6445         float_raise(float_flag_invalid, status);
6446         return 0;
6447     }
6448     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
6449               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
6450          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
6451               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
6452        ) {
6453         if (floatx80_is_signaling_nan(a, status)
6454          || floatx80_is_signaling_nan(b, status)) {
6455             float_raise(float_flag_invalid, status);
6456         }
6457         return 0;
6458     }
6459     return
6460            ( a.low == b.low )
6461         && (    ( a.high == b.high )
6462              || (    ( a.low == 0 )
6463                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6464            );
6465 
6466 }
6467 
6468 /*----------------------------------------------------------------------------
6469 | Returns 1 if the extended double-precision floating-point value `a' is less
6470 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
6471 | do not cause an exception.  Otherwise, the comparison is performed according
6472 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6473 *----------------------------------------------------------------------------*/
6474 
6475 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
6476 {
6477     flag aSign, bSign;
6478 
6479     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6480         float_raise(float_flag_invalid, status);
6481         return 0;
6482     }
6483     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
6484               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
6485          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
6486               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
6487        ) {
6488         if (floatx80_is_signaling_nan(a, status)
6489          || floatx80_is_signaling_nan(b, status)) {
6490             float_raise(float_flag_invalid, status);
6491         }
6492         return 0;
6493     }
6494     aSign = extractFloatx80Sign( a );
6495     bSign = extractFloatx80Sign( b );
6496     if ( aSign != bSign ) {
6497         return
6498                aSign
6499             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6500                  == 0 );
6501     }
6502     return
6503           aSign ? le128( b.high, b.low, a.high, a.low )
6504         : le128( a.high, a.low, b.high, b.low );
6505 
6506 }
6507 
6508 /*----------------------------------------------------------------------------
6509 | Returns 1 if the extended double-precision floating-point value `a' is less
6510 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
6511 | an exception.  Otherwise, the comparison is performed according to the
6512 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6513 *----------------------------------------------------------------------------*/
6514 
6515 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
6516 {
6517     flag aSign, bSign;
6518 
6519     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6520         float_raise(float_flag_invalid, status);
6521         return 0;
6522     }
6523     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
6524               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
6525          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
6526               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
6527        ) {
6528         if (floatx80_is_signaling_nan(a, status)
6529          || floatx80_is_signaling_nan(b, status)) {
6530             float_raise(float_flag_invalid, status);
6531         }
6532         return 0;
6533     }
6534     aSign = extractFloatx80Sign( a );
6535     bSign = extractFloatx80Sign( b );
6536     if ( aSign != bSign ) {
6537         return
6538                aSign
6539             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6540                  != 0 );
6541     }
6542     return
6543           aSign ? lt128( b.high, b.low, a.high, a.low )
6544         : lt128( a.high, a.low, b.high, b.low );
6545 
6546 }
6547 
6548 /*----------------------------------------------------------------------------
6549 | Returns 1 if the extended double-precision floating-point values `a' and `b'
6550 | cannot be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.
6551 | The comparison is performed according to the IEC/IEEE Standard for Binary
6552 | Floating-Point Arithmetic.
6553 *----------------------------------------------------------------------------*/
6554 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
6555 {
6556     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6557         float_raise(float_flag_invalid, status);
6558         return 1;
6559     }
6560     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
6561               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
6562          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
6563               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
6564        ) {
6565         if (floatx80_is_signaling_nan(a, status)
6566          || floatx80_is_signaling_nan(b, status)) {
6567             float_raise(float_flag_invalid, status);
6568         }
6569         return 1;
6570     }
6571     return 0;
6572 }
6573 
6574 /*----------------------------------------------------------------------------
6575 | Returns the result of converting the quadruple-precision floating-point
6576 | value `a' to the 32-bit two's complement integer format.  The conversion
6577 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6578 | Arithmetic---which means in particular that the conversion is rounded
6579 | according to the current rounding mode.  If `a' is a NaN, the largest
6580 | positive integer is returned.  Otherwise, if the conversion overflows, the
6581 | largest integer with the same sign as `a' is returned.
6582 *----------------------------------------------------------------------------*/
6583 
6584 int32_t float128_to_int32(float128 a, float_status *status)
6585 {
6586     flag aSign;
6587     int32_t aExp, shiftCount;
6588     uint64_t aSig0, aSig1;
6589 
6590     aSig1 = extractFloat128Frac1( a );
6591     aSig0 = extractFloat128Frac0( a );
6592     aExp = extractFloat128Exp( a );
6593     aSign = extractFloat128Sign( a );
6594     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
6595     if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
6596     aSig0 |= ( aSig1 != 0 );
6597     shiftCount = 0x4028 - aExp;
6598     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
6599     return roundAndPackInt32(aSign, aSig0, status);
6600 
6601 }
6602 
6603 /*----------------------------------------------------------------------------
6604 | Returns the result of converting the quadruple-precision floating-point
6605 | value `a' to the 32-bit two's complement integer format.  The conversion
6606 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6607 | Arithmetic, except that the conversion is always rounded toward zero.  If
6608 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
6609 | conversion overflows, the largest integer with the same sign as `a' is
6610 | returned.
6611 *----------------------------------------------------------------------------*/
6612 
6613 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
6614 {
6615     flag aSign;
6616     int32_t aExp, shiftCount;
6617     uint64_t aSig0, aSig1, savedASig;
6618     int32_t z;
6619 
6620     aSig1 = extractFloat128Frac1( a );
6621     aSig0 = extractFloat128Frac0( a );
6622     aExp = extractFloat128Exp( a );
6623     aSign = extractFloat128Sign( a );
6624     aSig0 |= ( aSig1 != 0 );
6625     if ( 0x401E < aExp ) {
6626         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
6627         goto invalid;
6628     }
6629     else if ( aExp < 0x3FFF ) {
6630         if (aExp || aSig0) {
6631             status->float_exception_flags |= float_flag_inexact;
6632         }
6633         return 0;
6634     }
6635     aSig0 |= UINT64_C(0x0001000000000000);
6636     shiftCount = 0x402F - aExp;
6637     savedASig = aSig0;
6638     aSig0 >>= shiftCount;
6639     z = aSig0;
6640     if ( aSign ) z = - z;
6641     if ( ( z < 0 ) ^ aSign ) {
6642  invalid:
6643         float_raise(float_flag_invalid, status);
6644         return aSign ? INT32_MIN : INT32_MAX;
6645     }
6646     if ( ( aSig0<<shiftCount ) != savedASig ) {
6647         status->float_exception_flags |= float_flag_inexact;
6648     }
6649     return z;
6650 
6651 }
6652 
6653 /*----------------------------------------------------------------------------
6654 | Returns the result of converting the quadruple-precision floating-point
6655 | value `a' to the 64-bit two's complement integer format.  The conversion
6656 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6657 | Arithmetic---which means in particular that the conversion is rounded
6658 | according to the current rounding mode.  If `a' is a NaN, the largest
6659 | positive integer is returned.  Otherwise, if the conversion overflows, the
6660 | largest integer with the same sign as `a' is returned.
6661 *----------------------------------------------------------------------------*/
6662 
6663 int64_t float128_to_int64(float128 a, float_status *status)
6664 {
6665     flag aSign;
6666     int32_t aExp, shiftCount;
6667     uint64_t aSig0, aSig1;
6668 
6669     aSig1 = extractFloat128Frac1( a );
6670     aSig0 = extractFloat128Frac0( a );
6671     aExp = extractFloat128Exp( a );
6672     aSign = extractFloat128Sign( a );
6673     if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
6674     shiftCount = 0x402F - aExp;
6675     if ( shiftCount <= 0 ) {
6676         if ( 0x403E < aExp ) {
6677             float_raise(float_flag_invalid, status);
6678             if (    ! aSign
6679                  || (    ( aExp == 0x7FFF )
6680                       && ( aSig1 || ( aSig0 != UINT64_C(0x0001000000000000) ) )
6681                     )
6682                ) {
6683                 return INT64_MAX;
6684             }
6685             return INT64_MIN;
6686         }
6687         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
6688     }
6689     else {
6690         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
6691     }
6692     return roundAndPackInt64(aSign, aSig0, aSig1, status);
6693 
6694 }
6695 
6696 /*----------------------------------------------------------------------------
6697 | Returns the result of converting the quadruple-precision floating-point
6698 | value `a' to the 64-bit two's complement integer format.  The conversion
6699 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6700 | Arithmetic, except that the conversion is always rounded toward zero.
6701 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
6702 | the conversion overflows, the largest integer with the same sign as `a' is
6703 | returned.
6704 *----------------------------------------------------------------------------*/
6705 
6706 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
6707 {
6708     flag aSign;
6709     int32_t aExp, shiftCount;
6710     uint64_t aSig0, aSig1;
6711     int64_t z;
6712 
6713     aSig1 = extractFloat128Frac1( a );
6714     aSig0 = extractFloat128Frac0( a );
6715     aExp = extractFloat128Exp( a );
6716     aSign = extractFloat128Sign( a );
6717     if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
6718     shiftCount = aExp - 0x402F;
6719     if ( 0 < shiftCount ) {
6720         if ( 0x403E <= aExp ) {
6721             aSig0 &= UINT64_C(0x0000FFFFFFFFFFFF);
6722             if (    ( a.high == UINT64_C(0xC03E000000000000) )
6723                  && ( aSig1 < UINT64_C(0x0002000000000000) ) ) {
6724                 if (aSig1) {
6725                     status->float_exception_flags |= float_flag_inexact;
6726                 }
6727             }
6728             else {
6729                 float_raise(float_flag_invalid, status);
6730                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
6731                     return INT64_MAX;
6732                 }
6733             }
6734             return INT64_MIN;
6735         }
6736         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
6737         if ( (uint64_t) ( aSig1<<shiftCount ) ) {
6738             status->float_exception_flags |= float_flag_inexact;
6739         }
6740     }
6741     else {
6742         if ( aExp < 0x3FFF ) {
6743             if ( aExp | aSig0 | aSig1 ) {
6744                 status->float_exception_flags |= float_flag_inexact;
6745             }
6746             return 0;
6747         }
6748         z = aSig0>>( - shiftCount );
6749         if (    aSig1
6750              || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
6751             status->float_exception_flags |= float_flag_inexact;
6752         }
6753     }
6754     if ( aSign ) z = - z;
6755     return z;
6756 
6757 }
6758 
6759 /*----------------------------------------------------------------------------
6760 | Returns the result of converting the quadruple-precision floating-point value
6761 | `a' to the 64-bit unsigned integer format.  The conversion is
6762 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6763 | Arithmetic---which means in particular that the conversion is rounded
6764 | according to the current rounding mode.  If `a' is a NaN, the largest
6765 | positive integer is returned.  If the conversion overflows, the
6766 | largest unsigned integer is returned.  If 'a' is negative, the value is
6767 | rounded and zero is returned; negative values that do not round to zero
6768 | will raise the inexact exception.
6769 *----------------------------------------------------------------------------*/
6770 
6771 uint64_t float128_to_uint64(float128 a, float_status *status)
6772 {
6773     flag aSign;
6774     int aExp;
6775     int shiftCount;
6776     uint64_t aSig0, aSig1;
6777 
6778     aSig0 = extractFloat128Frac0(a);
6779     aSig1 = extractFloat128Frac1(a);
6780     aExp = extractFloat128Exp(a);
6781     aSign = extractFloat128Sign(a);
6782     if (aSign && (aExp > 0x3FFE)) {
6783         float_raise(float_flag_invalid, status);
6784         if (float128_is_any_nan(a)) {
6785             return UINT64_MAX;
6786         } else {
6787             return 0;
6788         }
6789     }
6790     if (aExp) {
6791         aSig0 |= UINT64_C(0x0001000000000000);
6792     }
6793     shiftCount = 0x402F - aExp;
6794     if (shiftCount <= 0) {
6795         if (0x403E < aExp) {
6796             float_raise(float_flag_invalid, status);
6797             return UINT64_MAX;
6798         }
6799         shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
6800     } else {
6801         shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
6802     }
6803     return roundAndPackUint64(aSign, aSig0, aSig1, status);
6804 }
6805 
6806 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
6807 {
6808     uint64_t v;
6809     signed char current_rounding_mode = status->float_rounding_mode;
6810 
6811     set_float_rounding_mode(float_round_to_zero, status);
6812     v = float128_to_uint64(a, status);
6813     set_float_rounding_mode(current_rounding_mode, status);
6814 
6815     return v;
6816 }
6817 
6818 /*----------------------------------------------------------------------------
6819 | Returns the result of converting the quadruple-precision floating-point
6820 | value `a' to the 32-bit unsigned integer format.  The conversion
6821 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6822 | Arithmetic except that the conversion is always rounded toward zero.
6823 | If `a' is a NaN, the largest positive integer is returned.  Otherwise,
6824 | if the conversion overflows, the largest unsigned integer is returned.
6825 | If 'a' is negative, the value is rounded and zero is returned; negative
6826 | values that do not round to zero will raise the inexact exception.
6827 *----------------------------------------------------------------------------*/
6828 
6829 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
6830 {
6831     uint64_t v;
6832     uint32_t res;
6833     int old_exc_flags = get_float_exception_flags(status);
6834 
6835     v = float128_to_uint64_round_to_zero(a, status);
6836     if (v > 0xffffffff) {
6837         res = 0xffffffff;
6838     } else {
6839         return v;
6840     }
6841     set_float_exception_flags(old_exc_flags, status);
6842     float_raise(float_flag_invalid, status);
6843     return res;
6844 }
6845 
6846 /*----------------------------------------------------------------------------
6847 | Returns the result of converting the quadruple-precision floating-point value
6848 | `a' to the 32-bit unsigned integer format.  The conversion is
6849 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6850 | Arithmetic---which means in particular that the conversion is rounded
6851 | according to the current rounding mode.  If `a' is a NaN, the largest
6852 | positive integer is returned.  If the conversion overflows, the
6853 | largest unsigned integer is returned.  If 'a' is negative, the value is
6854 | rounded and zero is returned; negative values that do not round to zero
6855 | will raise the inexact exception.
6856 *----------------------------------------------------------------------------*/
6857 
6858 uint32_t float128_to_uint32(float128 a, float_status *status)
6859 {
6860     uint64_t v;
6861     uint32_t res;
6862     int old_exc_flags = get_float_exception_flags(status);
6863 
6864     v = float128_to_uint64(a, status);
6865     if (v > 0xffffffff) {
6866         res = 0xffffffff;
6867     } else {
6868         return v;
6869     }
6870     set_float_exception_flags(old_exc_flags, status);
6871     float_raise(float_flag_invalid, status);
6872     return res;
6873 }
6874 
6875 /*----------------------------------------------------------------------------
6876 | Returns the result of converting the quadruple-precision floating-point
6877 | value `a' to the single-precision floating-point format.  The conversion
6878 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6879 | Arithmetic.
6880 *----------------------------------------------------------------------------*/
6881 
6882 float32 float128_to_float32(float128 a, float_status *status)
6883 {
6884     flag aSign;
6885     int32_t aExp;
6886     uint64_t aSig0, aSig1;
6887     uint32_t zSig;
6888 
6889     aSig1 = extractFloat128Frac1( a );
6890     aSig0 = extractFloat128Frac0( a );
6891     aExp = extractFloat128Exp( a );
6892     aSign = extractFloat128Sign( a );
6893     if ( aExp == 0x7FFF ) {
6894         if ( aSig0 | aSig1 ) {
6895             return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
6896         }
6897         return packFloat32( aSign, 0xFF, 0 );
6898     }
6899     aSig0 |= ( aSig1 != 0 );
6900     shift64RightJamming( aSig0, 18, &aSig0 );
6901     zSig = aSig0;
6902     if ( aExp || zSig ) {
6903         zSig |= 0x40000000;
6904         aExp -= 0x3F81;
6905     }
6906     return roundAndPackFloat32(aSign, aExp, zSig, status);
6907 
6908 }
6909 
6910 /*----------------------------------------------------------------------------
6911 | Returns the result of converting the quadruple-precision floating-point
6912 | value `a' to the double-precision floating-point format.  The conversion
6913 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6914 | Arithmetic.
6915 *----------------------------------------------------------------------------*/
6916 
6917 float64 float128_to_float64(float128 a, float_status *status)
6918 {
6919     flag aSign;
6920     int32_t aExp;
6921     uint64_t aSig0, aSig1;
6922 
6923     aSig1 = extractFloat128Frac1( a );
6924     aSig0 = extractFloat128Frac0( a );
6925     aExp = extractFloat128Exp( a );
6926     aSign = extractFloat128Sign( a );
6927     if ( aExp == 0x7FFF ) {
6928         if ( aSig0 | aSig1 ) {
6929             return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
6930         }
6931         return packFloat64( aSign, 0x7FF, 0 );
6932     }
6933     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6934     aSig0 |= ( aSig1 != 0 );
6935     if ( aExp || aSig0 ) {
6936         aSig0 |= UINT64_C(0x4000000000000000);
6937         aExp -= 0x3C01;
6938     }
6939     return roundAndPackFloat64(aSign, aExp, aSig0, status);
6940 
6941 }
6942 
6943 /*----------------------------------------------------------------------------
6944 | Returns the result of converting the quadruple-precision floating-point
6945 | value `a' to the extended double-precision floating-point format.  The
6946 | conversion is performed according to the IEC/IEEE Standard for Binary
6947 | Floating-Point Arithmetic.
6948 *----------------------------------------------------------------------------*/
6949 
6950 floatx80 float128_to_floatx80(float128 a, float_status *status)
6951 {
6952     flag aSign;
6953     int32_t aExp;
6954     uint64_t aSig0, aSig1;
6955 
6956     aSig1 = extractFloat128Frac1( a );
6957     aSig0 = extractFloat128Frac0( a );
6958     aExp = extractFloat128Exp( a );
6959     aSign = extractFloat128Sign( a );
6960     if ( aExp == 0x7FFF ) {
6961         if ( aSig0 | aSig1 ) {
6962             return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
6963         }
6964         return packFloatx80(aSign, floatx80_infinity_high,
6965                                    floatx80_infinity_low);
6966     }
6967     if ( aExp == 0 ) {
6968         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
6969         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6970     }
6971     else {
6972         aSig0 |= UINT64_C(0x0001000000000000);
6973     }
6974     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
6975     return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
6976 
6977 }
6978 
6979 /*----------------------------------------------------------------------------
6980 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6981 | returns the result as a quadruple-precision floating-point value.  The
6982 | operation is performed according to the IEC/IEEE Standard for Binary
6983 | Floating-Point Arithmetic.
6984 *----------------------------------------------------------------------------*/
6985 
6986 float128 float128_round_to_int(float128 a, float_status *status)
6987 {
6988     flag aSign;
6989     int32_t aExp;
6990     uint64_t lastBitMask, roundBitsMask;
6991     float128 z;
6992 
6993     aExp = extractFloat128Exp( a );
6994     if ( 0x402F <= aExp ) {
6995         if ( 0x406F <= aExp ) {
6996             if (    ( aExp == 0x7FFF )
6997                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
6998                ) {
6999                 return propagateFloat128NaN(a, a, status);
7000             }
7001             return a;
7002         }
7003         lastBitMask = 1;
7004         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
7005         roundBitsMask = lastBitMask - 1;
7006         z = a;
7007         switch (status->float_rounding_mode) {
7008         case float_round_nearest_even:
7009             if ( lastBitMask ) {
7010                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
7011                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
7012             }
7013             else {
7014                 if ( (int64_t) z.low < 0 ) {
7015                     ++z.high;
7016                     if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
7017                 }
7018             }
7019             break;
7020         case float_round_ties_away:
7021             if (lastBitMask) {
7022                 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
7023             } else {
7024                 if ((int64_t) z.low < 0) {
7025                     ++z.high;
7026                 }
7027             }
7028             break;
7029         case float_round_to_zero:
7030             break;
7031         case float_round_up:
7032             if (!extractFloat128Sign(z)) {
7033                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
7034             }
7035             break;
7036         case float_round_down:
7037             if (extractFloat128Sign(z)) {
7038                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
7039             }
7040             break;
7041         case float_round_to_odd:
7042             /*
7043              * Note that if lastBitMask == 0, the last bit is the lsb
7044              * of high, and roundBitsMask == -1.
7045              */
7046             if ((lastBitMask ? z.low & lastBitMask : z.high & 1) == 0) {
7047                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
7048             }
7049             break;
7050         default:
7051             abort();
7052         }
7053         z.low &= ~ roundBitsMask;
7054     }
7055     else {
7056         if ( aExp < 0x3FFF ) {
7057             if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
7058             status->float_exception_flags |= float_flag_inexact;
7059             aSign = extractFloat128Sign( a );
7060             switch (status->float_rounding_mode) {
7061             case float_round_nearest_even:
7062                 if (    ( aExp == 0x3FFE )
7063                      && (   extractFloat128Frac0( a )
7064                           | extractFloat128Frac1( a ) )
7065                    ) {
7066                     return packFloat128( aSign, 0x3FFF, 0, 0 );
7067                 }
7068                 break;
7069             case float_round_ties_away:
7070                 if (aExp == 0x3FFE) {
7071                     return packFloat128(aSign, 0x3FFF, 0, 0);
7072                 }
7073                 break;
7074             case float_round_down:
7075                 return
7076                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
7077                     : packFloat128( 0, 0, 0, 0 );
7078             case float_round_up:
7079                 return
7080                       aSign ? packFloat128( 1, 0, 0, 0 )
7081                     : packFloat128( 0, 0x3FFF, 0, 0 );
7082 
7083             case float_round_to_odd:
7084                 return packFloat128(aSign, 0x3FFF, 0, 0);
7085             }
7086             return packFloat128( aSign, 0, 0, 0 );
7087         }
7088         lastBitMask = 1;
7089         lastBitMask <<= 0x402F - aExp;
7090         roundBitsMask = lastBitMask - 1;
7091         z.low = 0;
7092         z.high = a.high;
7093         switch (status->float_rounding_mode) {
7094         case float_round_nearest_even:
7095             z.high += lastBitMask>>1;
7096             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
7097                 z.high &= ~ lastBitMask;
7098             }
7099             break;
7100         case float_round_ties_away:
7101             z.high += lastBitMask>>1;
7102             break;
7103         case float_round_to_zero:
7104             break;
7105         case float_round_up:
7106             if (!extractFloat128Sign(z)) {
7107                 z.high |= ( a.low != 0 );
7108                 z.high += roundBitsMask;
7109             }
7110             break;
7111         case float_round_down:
7112             if (extractFloat128Sign(z)) {
7113                 z.high |= (a.low != 0);
7114                 z.high += roundBitsMask;
7115             }
7116             break;
7117         case float_round_to_odd:
7118             if ((z.high & lastBitMask) == 0) {
7119                 z.high |= (a.low != 0);
7120                 z.high += roundBitsMask;
7121             }
7122             break;
7123         default:
7124             abort();
7125         }
7126         z.high &= ~ roundBitsMask;
7127     }
7128     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
7129         status->float_exception_flags |= float_flag_inexact;
7130     }
7131     return z;
7132 
7133 }
7134 
7135 /*----------------------------------------------------------------------------
7136 | Returns the result of adding the absolute values of the quadruple-precision
7137 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
7138 | before being returned.  `zSign' is ignored if the result is a NaN.
7139 | The addition is performed according to the IEC/IEEE Standard for Binary
7140 | Floating-Point Arithmetic.
7141 *----------------------------------------------------------------------------*/
7142 
7143 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
7144                                 float_status *status)
7145 {
7146     int32_t aExp, bExp, zExp;
7147     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
7148     int32_t expDiff;
7149 
7150     aSig1 = extractFloat128Frac1( a );
7151     aSig0 = extractFloat128Frac0( a );
7152     aExp = extractFloat128Exp( a );
7153     bSig1 = extractFloat128Frac1( b );
7154     bSig0 = extractFloat128Frac0( b );
7155     bExp = extractFloat128Exp( b );
7156     expDiff = aExp - bExp;
7157     if ( 0 < expDiff ) {
7158         if ( aExp == 0x7FFF ) {
7159             if (aSig0 | aSig1) {
7160                 return propagateFloat128NaN(a, b, status);
7161             }
7162             return a;
7163         }
7164         if ( bExp == 0 ) {
7165             --expDiff;
7166         }
7167         else {
7168             bSig0 |= UINT64_C(0x0001000000000000);
7169         }
7170         shift128ExtraRightJamming(
7171             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
7172         zExp = aExp;
7173     }
7174     else if ( expDiff < 0 ) {
7175         if ( bExp == 0x7FFF ) {
7176             if (bSig0 | bSig1) {
7177                 return propagateFloat128NaN(a, b, status);
7178             }
7179             return packFloat128( zSign, 0x7FFF, 0, 0 );
7180         }
7181         if ( aExp == 0 ) {
7182             ++expDiff;
7183         }
7184         else {
7185             aSig0 |= UINT64_C(0x0001000000000000);
7186         }
7187         shift128ExtraRightJamming(
7188             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
7189         zExp = bExp;
7190     }
7191     else {
7192         if ( aExp == 0x7FFF ) {
7193             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
7194                 return propagateFloat128NaN(a, b, status);
7195             }
7196             return a;
7197         }
7198         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
7199         if ( aExp == 0 ) {
7200             if (status->flush_to_zero) {
7201                 if (zSig0 | zSig1) {
7202                     float_raise(float_flag_output_denormal, status);
7203                 }
7204                 return packFloat128(zSign, 0, 0, 0);
7205             }
7206             return packFloat128( zSign, 0, zSig0, zSig1 );
7207         }
7208         zSig2 = 0;
7209         zSig0 |= UINT64_C(0x0002000000000000);
7210         zExp = aExp;
7211         goto shiftRight1;
7212     }
7213     aSig0 |= UINT64_C(0x0001000000000000);
7214     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
7215     --zExp;
7216     if ( zSig0 < UINT64_C(0x0002000000000000) ) goto roundAndPack;
7217     ++zExp;
7218  shiftRight1:
7219     shift128ExtraRightJamming(
7220         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
7221  roundAndPack:
7222     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
7223 
7224 }
7225 
7226 /*----------------------------------------------------------------------------
7227 | Returns the result of subtracting the absolute values of the quadruple-
7228 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
7229 | difference is negated before being returned.  `zSign' is ignored if the
7230 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
7231 | Standard for Binary Floating-Point Arithmetic.
7232 *----------------------------------------------------------------------------*/
7233 
7234 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
7235                                 float_status *status)
7236 {
7237     int32_t aExp, bExp, zExp;
7238     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
7239     int32_t expDiff;
7240 
7241     aSig1 = extractFloat128Frac1( a );
7242     aSig0 = extractFloat128Frac0( a );
7243     aExp = extractFloat128Exp( a );
7244     bSig1 = extractFloat128Frac1( b );
7245     bSig0 = extractFloat128Frac0( b );
7246     bExp = extractFloat128Exp( b );
7247     expDiff = aExp - bExp;
7248     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
7249     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
7250     if ( 0 < expDiff ) goto aExpBigger;
7251     if ( expDiff < 0 ) goto bExpBigger;
7252     if ( aExp == 0x7FFF ) {
7253         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
7254             return propagateFloat128NaN(a, b, status);
7255         }
7256         float_raise(float_flag_invalid, status);
7257         return float128_default_nan(status);
7258     }
7259     if ( aExp == 0 ) {
7260         aExp = 1;
7261         bExp = 1;
7262     }
7263     if ( bSig0 < aSig0 ) goto aBigger;
7264     if ( aSig0 < bSig0 ) goto bBigger;
7265     if ( bSig1 < aSig1 ) goto aBigger;
7266     if ( aSig1 < bSig1 ) goto bBigger;
7267     return packFloat128(status->float_rounding_mode == float_round_down,
7268                         0, 0, 0);
7269  bExpBigger:
7270     if ( bExp == 0x7FFF ) {
7271         if (bSig0 | bSig1) {
7272             return propagateFloat128NaN(a, b, status);
7273         }
7274         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
7275     }
7276     if ( aExp == 0 ) {
7277         ++expDiff;
7278     }
7279     else {
7280         aSig0 |= UINT64_C(0x4000000000000000);
7281     }
7282     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
7283     bSig0 |= UINT64_C(0x4000000000000000);
7284  bBigger:
7285     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
7286     zExp = bExp;
7287     zSign ^= 1;
7288     goto normalizeRoundAndPack;
7289  aExpBigger:
7290     if ( aExp == 0x7FFF ) {
7291         if (aSig0 | aSig1) {
7292             return propagateFloat128NaN(a, b, status);
7293         }
7294         return a;
7295     }
7296     if ( bExp == 0 ) {
7297         --expDiff;
7298     }
7299     else {
7300         bSig0 |= UINT64_C(0x4000000000000000);
7301     }
7302     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
7303     aSig0 |= UINT64_C(0x4000000000000000);
7304  aBigger:
7305     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
7306     zExp = aExp;
7307  normalizeRoundAndPack:
7308     --zExp;
7309     return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
7310                                          status);
7311 
7312 }
7313 
7314 /*----------------------------------------------------------------------------
7315 | Returns the result of adding the quadruple-precision floating-point values
7316 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
7317 | for Binary Floating-Point Arithmetic.
7318 *----------------------------------------------------------------------------*/
7319 
7320 float128 float128_add(float128 a, float128 b, float_status *status)
7321 {
7322     flag aSign, bSign;
7323 
7324     aSign = extractFloat128Sign( a );
7325     bSign = extractFloat128Sign( b );
7326     if ( aSign == bSign ) {
7327         return addFloat128Sigs(a, b, aSign, status);
7328     }
7329     else {
7330         return subFloat128Sigs(a, b, aSign, status);
7331     }
7332 
7333 }
7334 
7335 /*----------------------------------------------------------------------------
7336 | Returns the result of subtracting the quadruple-precision floating-point
7337 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
7338 | Standard for Binary Floating-Point Arithmetic.
7339 *----------------------------------------------------------------------------*/
7340 
7341 float128 float128_sub(float128 a, float128 b, float_status *status)
7342 {
7343     flag aSign, bSign;
7344 
7345     aSign = extractFloat128Sign( a );
7346     bSign = extractFloat128Sign( b );
7347     if ( aSign == bSign ) {
7348         return subFloat128Sigs(a, b, aSign, status);
7349     }
7350     else {
7351         return addFloat128Sigs(a, b, aSign, status);
7352     }
7353 
7354 }
7355 
7356 /*----------------------------------------------------------------------------
7357 | Returns the result of multiplying the quadruple-precision floating-point
7358 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
7359 | Standard for Binary Floating-Point Arithmetic.
7360 *----------------------------------------------------------------------------*/
7361 
7362 float128 float128_mul(float128 a, float128 b, float_status *status)
7363 {
7364     flag aSign, bSign, zSign;
7365     int32_t aExp, bExp, zExp;
7366     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
7367 
7368     aSig1 = extractFloat128Frac1( a );
7369     aSig0 = extractFloat128Frac0( a );
7370     aExp = extractFloat128Exp( a );
7371     aSign = extractFloat128Sign( a );
7372     bSig1 = extractFloat128Frac1( b );
7373     bSig0 = extractFloat128Frac0( b );
7374     bExp = extractFloat128Exp( b );
7375     bSign = extractFloat128Sign( b );
7376     zSign = aSign ^ bSign;
7377     if ( aExp == 0x7FFF ) {
7378         if (    ( aSig0 | aSig1 )
7379              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
7380             return propagateFloat128NaN(a, b, status);
7381         }
7382         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
7383         return packFloat128( zSign, 0x7FFF, 0, 0 );
7384     }
7385     if ( bExp == 0x7FFF ) {
7386         if (bSig0 | bSig1) {
7387             return propagateFloat128NaN(a, b, status);
7388         }
7389         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
7390  invalid:
7391             float_raise(float_flag_invalid, status);
7392             return float128_default_nan(status);
7393         }
7394         return packFloat128( zSign, 0x7FFF, 0, 0 );
7395     }
7396     if ( aExp == 0 ) {
7397         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
7398         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7399     }
7400     if ( bExp == 0 ) {
7401         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
7402         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
7403     }
7404     zExp = aExp + bExp - 0x4000;
7405     aSig0 |= UINT64_C(0x0001000000000000);
7406     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
7407     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
7408     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
7409     zSig2 |= ( zSig3 != 0 );
7410     if (UINT64_C( 0x0002000000000000) <= zSig0 ) {
7411         shift128ExtraRightJamming(
7412             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
7413         ++zExp;
7414     }
7415     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
7416 
7417 }
7418 
7419 /*----------------------------------------------------------------------------
7420 | Returns the result of dividing the quadruple-precision floating-point value
7421 | `a' by the corresponding value `b'.  The operation is performed according to
7422 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7423 *----------------------------------------------------------------------------*/
7424 
7425 float128 float128_div(float128 a, float128 b, float_status *status)
7426 {
7427     flag aSign, bSign, zSign;
7428     int32_t aExp, bExp, zExp;
7429     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
7430     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
7431 
7432     aSig1 = extractFloat128Frac1( a );
7433     aSig0 = extractFloat128Frac0( a );
7434     aExp = extractFloat128Exp( a );
7435     aSign = extractFloat128Sign( a );
7436     bSig1 = extractFloat128Frac1( b );
7437     bSig0 = extractFloat128Frac0( b );
7438     bExp = extractFloat128Exp( b );
7439     bSign = extractFloat128Sign( b );
7440     zSign = aSign ^ bSign;
7441     if ( aExp == 0x7FFF ) {
7442         if (aSig0 | aSig1) {
7443             return propagateFloat128NaN(a, b, status);
7444         }
7445         if ( bExp == 0x7FFF ) {
7446             if (bSig0 | bSig1) {
7447                 return propagateFloat128NaN(a, b, status);
7448             }
7449             goto invalid;
7450         }
7451         return packFloat128( zSign, 0x7FFF, 0, 0 );
7452     }
7453     if ( bExp == 0x7FFF ) {
7454         if (bSig0 | bSig1) {
7455             return propagateFloat128NaN(a, b, status);
7456         }
7457         return packFloat128( zSign, 0, 0, 0 );
7458     }
7459     if ( bExp == 0 ) {
7460         if ( ( bSig0 | bSig1 ) == 0 ) {
7461             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
7462  invalid:
7463                 float_raise(float_flag_invalid, status);
7464                 return float128_default_nan(status);
7465             }
7466             float_raise(float_flag_divbyzero, status);
7467             return packFloat128( zSign, 0x7FFF, 0, 0 );
7468         }
7469         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
7470     }
7471     if ( aExp == 0 ) {
7472         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
7473         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7474     }
7475     zExp = aExp - bExp + 0x3FFD;
7476     shortShift128Left(
7477         aSig0 | UINT64_C(0x0001000000000000), aSig1, 15, &aSig0, &aSig1 );
7478     shortShift128Left(
7479         bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 );
7480     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
7481         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
7482         ++zExp;
7483     }
7484     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
7485     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
7486     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
7487     while ( (int64_t) rem0 < 0 ) {
7488         --zSig0;
7489         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
7490     }
7491     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
7492     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
7493         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
7494         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
7495         while ( (int64_t) rem1 < 0 ) {
7496             --zSig1;
7497             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
7498         }
7499         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
7500     }
7501     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
7502     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
7503 
7504 }
7505 
7506 /*----------------------------------------------------------------------------
7507 | Returns the remainder of the quadruple-precision floating-point value `a'
7508 | with respect to the corresponding value `b'.  The operation is performed
7509 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7510 *----------------------------------------------------------------------------*/
7511 
7512 float128 float128_rem(float128 a, float128 b, float_status *status)
7513 {
7514     flag aSign, zSign;
7515     int32_t aExp, bExp, expDiff;
7516     uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
7517     uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
7518     int64_t sigMean0;
7519 
7520     aSig1 = extractFloat128Frac1( a );
7521     aSig0 = extractFloat128Frac0( a );
7522     aExp = extractFloat128Exp( a );
7523     aSign = extractFloat128Sign( a );
7524     bSig1 = extractFloat128Frac1( b );
7525     bSig0 = extractFloat128Frac0( b );
7526     bExp = extractFloat128Exp( b );
7527     if ( aExp == 0x7FFF ) {
7528         if (    ( aSig0 | aSig1 )
7529              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
7530             return propagateFloat128NaN(a, b, status);
7531         }
7532         goto invalid;
7533     }
7534     if ( bExp == 0x7FFF ) {
7535         if (bSig0 | bSig1) {
7536             return propagateFloat128NaN(a, b, status);
7537         }
7538         return a;
7539     }
7540     if ( bExp == 0 ) {
7541         if ( ( bSig0 | bSig1 ) == 0 ) {
7542  invalid:
7543             float_raise(float_flag_invalid, status);
7544             return float128_default_nan(status);
7545         }
7546         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
7547     }
7548     if ( aExp == 0 ) {
7549         if ( ( aSig0 | aSig1 ) == 0 ) return a;
7550         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7551     }
7552     expDiff = aExp - bExp;
7553     if ( expDiff < -1 ) return a;
7554     shortShift128Left(
7555         aSig0 | UINT64_C(0x0001000000000000),
7556         aSig1,
7557         15 - ( expDiff < 0 ),
7558         &aSig0,
7559         &aSig1
7560     );
7561     shortShift128Left(
7562         bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 );
7563     q = le128( bSig0, bSig1, aSig0, aSig1 );
7564     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
7565     expDiff -= 64;
7566     while ( 0 < expDiff ) {
7567         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
7568         q = ( 4 < q ) ? q - 4 : 0;
7569         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
7570         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
7571         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
7572         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
7573         expDiff -= 61;
7574     }
7575     if ( -64 < expDiff ) {
7576         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
7577         q = ( 4 < q ) ? q - 4 : 0;
7578         q >>= - expDiff;
7579         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
7580         expDiff += 52;
7581         if ( expDiff < 0 ) {
7582             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
7583         }
7584         else {
7585             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
7586         }
7587         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
7588         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
7589     }
7590     else {
7591         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
7592         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
7593     }
7594     do {
7595         alternateASig0 = aSig0;
7596         alternateASig1 = aSig1;
7597         ++q;
7598         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
7599     } while ( 0 <= (int64_t) aSig0 );
7600     add128(
7601         aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
7602     if (    ( sigMean0 < 0 )
7603          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
7604         aSig0 = alternateASig0;
7605         aSig1 = alternateASig1;
7606     }
7607     zSign = ( (int64_t) aSig0 < 0 );
7608     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
7609     return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
7610                                          status);
7611 }
7612 
7613 /*----------------------------------------------------------------------------
7614 | Returns the square root of the quadruple-precision floating-point value `a'.
7615 | The operation is performed according to the IEC/IEEE Standard for Binary
7616 | Floating-Point Arithmetic.
7617 *----------------------------------------------------------------------------*/
7618 
7619 float128 float128_sqrt(float128 a, float_status *status)
7620 {
7621     flag aSign;
7622     int32_t aExp, zExp;
7623     uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
7624     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
7625 
7626     aSig1 = extractFloat128Frac1( a );
7627     aSig0 = extractFloat128Frac0( a );
7628     aExp = extractFloat128Exp( a );
7629     aSign = extractFloat128Sign( a );
7630     if ( aExp == 0x7FFF ) {
7631         if (aSig0 | aSig1) {
7632             return propagateFloat128NaN(a, a, status);
7633         }
7634         if ( ! aSign ) return a;
7635         goto invalid;
7636     }
7637     if ( aSign ) {
7638         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
7639  invalid:
7640         float_raise(float_flag_invalid, status);
7641         return float128_default_nan(status);
7642     }
7643     if ( aExp == 0 ) {
7644         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
7645         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7646     }
7647     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
7648     aSig0 |= UINT64_C(0x0001000000000000);
7649     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
7650     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
7651     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
7652     doubleZSig0 = zSig0<<1;
7653     mul64To128( zSig0, zSig0, &term0, &term1 );
7654     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
7655     while ( (int64_t) rem0 < 0 ) {
7656         --zSig0;
7657         doubleZSig0 -= 2;
7658         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
7659     }
7660     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
7661     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
7662         if ( zSig1 == 0 ) zSig1 = 1;
7663         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
7664         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
7665         mul64To128( zSig1, zSig1, &term2, &term3 );
7666         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
7667         while ( (int64_t) rem1 < 0 ) {
7668             --zSig1;
7669             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
7670             term3 |= 1;
7671             term2 |= doubleZSig0;
7672             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
7673         }
7674         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
7675     }
7676     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
7677     return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
7678 
7679 }
7680 
7681 /*----------------------------------------------------------------------------
7682 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
7683 | the corresponding value `b', and 0 otherwise.  The invalid exception is
7684 | raised if either operand is a NaN.  Otherwise, the comparison is performed
7685 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7686 *----------------------------------------------------------------------------*/
7687 
7688 int float128_eq(float128 a, float128 b, float_status *status)
7689 {
7690 
7691     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7692               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7693          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7694               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7695        ) {
7696         float_raise(float_flag_invalid, status);
7697         return 0;
7698     }
7699     return
7700            ( a.low == b.low )
7701         && (    ( a.high == b.high )
7702              || (    ( a.low == 0 )
7703                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
7704            );
7705 
7706 }
7707 
7708 /*----------------------------------------------------------------------------
7709 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7710 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
7711 | exception is raised if either operand is a NaN.  The comparison is performed
7712 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7713 *----------------------------------------------------------------------------*/
7714 
7715 int float128_le(float128 a, float128 b, float_status *status)
7716 {
7717     flag aSign, bSign;
7718 
7719     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7720               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7721          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7722               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7723        ) {
7724         float_raise(float_flag_invalid, status);
7725         return 0;
7726     }
7727     aSign = extractFloat128Sign( a );
7728     bSign = extractFloat128Sign( b );
7729     if ( aSign != bSign ) {
7730         return
7731                aSign
7732             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7733                  == 0 );
7734     }
7735     return
7736           aSign ? le128( b.high, b.low, a.high, a.low )
7737         : le128( a.high, a.low, b.high, b.low );
7738 
7739 }
7740 
7741 /*----------------------------------------------------------------------------
7742 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7743 | the corresponding value `b', and 0 otherwise.  The invalid exception is
7744 | raised if either operand is a NaN.  The comparison is performed according
7745 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7746 *----------------------------------------------------------------------------*/
7747 
7748 int float128_lt(float128 a, float128 b, float_status *status)
7749 {
7750     flag aSign, bSign;
7751 
7752     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7753               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7754          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7755               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7756        ) {
7757         float_raise(float_flag_invalid, status);
7758         return 0;
7759     }
7760     aSign = extractFloat128Sign( a );
7761     bSign = extractFloat128Sign( b );
7762     if ( aSign != bSign ) {
7763         return
7764                aSign
7765             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7766                  != 0 );
7767     }
7768     return
7769           aSign ? lt128( b.high, b.low, a.high, a.low )
7770         : lt128( a.high, a.low, b.high, b.low );
7771 
7772 }
7773 
7774 /*----------------------------------------------------------------------------
7775 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7776 | be compared, and 0 otherwise.  The invalid exception is raised if either
7777 | operand is a NaN. The comparison is performed according to the IEC/IEEE
7778 | Standard for Binary Floating-Point Arithmetic.
7779 *----------------------------------------------------------------------------*/
7780 
7781 int float128_unordered(float128 a, float128 b, float_status *status)
7782 {
7783     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7784               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7785          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7786               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7787        ) {
7788         float_raise(float_flag_invalid, status);
7789         return 1;
7790     }
7791     return 0;
7792 }
7793 
7794 /*----------------------------------------------------------------------------
7795 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
7796 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
7797 | exception.  The comparison is performed according to the IEC/IEEE Standard
7798 | for Binary Floating-Point Arithmetic.
7799 *----------------------------------------------------------------------------*/
7800 
7801 int float128_eq_quiet(float128 a, float128 b, float_status *status)
7802 {
7803 
7804     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7805               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7806          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7807               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7808        ) {
7809         if (float128_is_signaling_nan(a, status)
7810          || float128_is_signaling_nan(b, status)) {
7811             float_raise(float_flag_invalid, status);
7812         }
7813         return 0;
7814     }
7815     return
7816            ( a.low == b.low )
7817         && (    ( a.high == b.high )
7818              || (    ( a.low == 0 )
7819                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
7820            );
7821 
7822 }
7823 
7824 /*----------------------------------------------------------------------------
7825 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7826 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
7827 | cause an exception.  Otherwise, the comparison is performed according to the
7828 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7829 *----------------------------------------------------------------------------*/
7830 
7831 int float128_le_quiet(float128 a, float128 b, float_status *status)
7832 {
7833     flag aSign, bSign;
7834 
7835     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7836               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7837          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7838               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7839        ) {
7840         if (float128_is_signaling_nan(a, status)
7841          || float128_is_signaling_nan(b, status)) {
7842             float_raise(float_flag_invalid, status);
7843         }
7844         return 0;
7845     }
7846     aSign = extractFloat128Sign( a );
7847     bSign = extractFloat128Sign( b );
7848     if ( aSign != bSign ) {
7849         return
7850                aSign
7851             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7852                  == 0 );
7853     }
7854     return
7855           aSign ? le128( b.high, b.low, a.high, a.low )
7856         : le128( a.high, a.low, b.high, b.low );
7857 
7858 }
7859 
7860 /*----------------------------------------------------------------------------
7861 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7862 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
7863 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
7864 | Standard for Binary Floating-Point Arithmetic.
7865 *----------------------------------------------------------------------------*/
7866 
7867 int float128_lt_quiet(float128 a, float128 b, float_status *status)
7868 {
7869     flag aSign, bSign;
7870 
7871     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7872               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7873          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7874               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7875        ) {
7876         if (float128_is_signaling_nan(a, status)
7877          || float128_is_signaling_nan(b, status)) {
7878             float_raise(float_flag_invalid, status);
7879         }
7880         return 0;
7881     }
7882     aSign = extractFloat128Sign( a );
7883     bSign = extractFloat128Sign( b );
7884     if ( aSign != bSign ) {
7885         return
7886                aSign
7887             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7888                  != 0 );
7889     }
7890     return
7891           aSign ? lt128( b.high, b.low, a.high, a.low )
7892         : lt128( a.high, a.low, b.high, b.low );
7893 
7894 }
7895 
7896 /*----------------------------------------------------------------------------
7897 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7898 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
7899 | comparison is performed according to the IEC/IEEE Standard for Binary
7900 | Floating-Point Arithmetic.
7901 *----------------------------------------------------------------------------*/
7902 
7903 int float128_unordered_quiet(float128 a, float128 b, float_status *status)
7904 {
7905     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7906               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7907          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7908               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7909        ) {
7910         if (float128_is_signaling_nan(a, status)
7911          || float128_is_signaling_nan(b, status)) {
7912             float_raise(float_flag_invalid, status);
7913         }
7914         return 1;
7915     }
7916     return 0;
7917 }
7918 
7919 static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
7920                                             int is_quiet, float_status *status)
7921 {
7922     flag aSign, bSign;
7923 
7924     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
7925         float_raise(float_flag_invalid, status);
7926         return float_relation_unordered;
7927     }
7928     if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
7929           ( extractFloatx80Frac( a )<<1 ) ) ||
7930         ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
7931           ( extractFloatx80Frac( b )<<1 ) )) {
7932         if (!is_quiet ||
7933             floatx80_is_signaling_nan(a, status) ||
7934             floatx80_is_signaling_nan(b, status)) {
7935             float_raise(float_flag_invalid, status);
7936         }
7937         return float_relation_unordered;
7938     }
7939     aSign = extractFloatx80Sign( a );
7940     bSign = extractFloatx80Sign( b );
7941     if ( aSign != bSign ) {
7942 
7943         if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
7944              ( ( a.low | b.low ) == 0 ) ) {
7945             /* zero case */
7946             return float_relation_equal;
7947         } else {
7948             return 1 - (2 * aSign);
7949         }
7950     } else {
7951         if (a.low == b.low && a.high == b.high) {
7952             return float_relation_equal;
7953         } else {
7954             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7955         }
7956     }
7957 }
7958 
7959 int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
7960 {
7961     return floatx80_compare_internal(a, b, 0, status);
7962 }
7963 
7964 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
7965 {
7966     return floatx80_compare_internal(a, b, 1, status);
7967 }
7968 
7969 static inline int float128_compare_internal(float128 a, float128 b,
7970                                             int is_quiet, float_status *status)
7971 {
7972     flag aSign, bSign;
7973 
7974     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7975           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7976         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7977           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7978         if (!is_quiet ||
7979             float128_is_signaling_nan(a, status) ||
7980             float128_is_signaling_nan(b, status)) {
7981             float_raise(float_flag_invalid, status);
7982         }
7983         return float_relation_unordered;
7984     }
7985     aSign = extractFloat128Sign( a );
7986     bSign = extractFloat128Sign( b );
7987     if ( aSign != bSign ) {
7988         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7989             /* zero case */
7990             return float_relation_equal;
7991         } else {
7992             return 1 - (2 * aSign);
7993         }
7994     } else {
7995         if (a.low == b.low && a.high == b.high) {
7996             return float_relation_equal;
7997         } else {
7998             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7999         }
8000     }
8001 }
8002 
8003 int float128_compare(float128 a, float128 b, float_status *status)
8004 {
8005     return float128_compare_internal(a, b, 0, status);
8006 }
8007 
8008 int float128_compare_quiet(float128 a, float128 b, float_status *status)
8009 {
8010     return float128_compare_internal(a, b, 1, status);
8011 }
8012 
8013 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
8014 {
8015     flag aSign;
8016     int32_t aExp;
8017     uint64_t aSig;
8018 
8019     if (floatx80_invalid_encoding(a)) {
8020         float_raise(float_flag_invalid, status);
8021         return floatx80_default_nan(status);
8022     }
8023     aSig = extractFloatx80Frac( a );
8024     aExp = extractFloatx80Exp( a );
8025     aSign = extractFloatx80Sign( a );
8026 
8027     if ( aExp == 0x7FFF ) {
8028         if ( aSig<<1 ) {
8029             return propagateFloatx80NaN(a, a, status);
8030         }
8031         return a;
8032     }
8033 
8034     if (aExp == 0) {
8035         if (aSig == 0) {
8036             return a;
8037         }
8038         aExp++;
8039     }
8040 
8041     if (n > 0x10000) {
8042         n = 0x10000;
8043     } else if (n < -0x10000) {
8044         n = -0x10000;
8045     }
8046 
8047     aExp += n;
8048     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
8049                                          aSign, aExp, aSig, 0, status);
8050 }
8051 
8052 float128 float128_scalbn(float128 a, int n, float_status *status)
8053 {
8054     flag aSign;
8055     int32_t aExp;
8056     uint64_t aSig0, aSig1;
8057 
8058     aSig1 = extractFloat128Frac1( a );
8059     aSig0 = extractFloat128Frac0( a );
8060     aExp = extractFloat128Exp( a );
8061     aSign = extractFloat128Sign( a );
8062     if ( aExp == 0x7FFF ) {
8063         if ( aSig0 | aSig1 ) {
8064             return propagateFloat128NaN(a, a, status);
8065         }
8066         return a;
8067     }
8068     if (aExp != 0) {
8069         aSig0 |= UINT64_C(0x0001000000000000);
8070     } else if (aSig0 == 0 && aSig1 == 0) {
8071         return a;
8072     } else {
8073         aExp++;
8074     }
8075 
8076     if (n > 0x10000) {
8077         n = 0x10000;
8078     } else if (n < -0x10000) {
8079         n = -0x10000;
8080     }
8081 
8082     aExp += n - 1;
8083     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
8084                                          , status);
8085 
8086 }
8087 
8088 static void __attribute__((constructor)) softfloat_init(void)
8089 {
8090     union_float64 ua, ub, uc, ur;
8091 
8092     if (QEMU_NO_HARDFLOAT) {
8093         return;
8094     }
8095     /*
8096      * Test that the host's FMA is not obviously broken. For example,
8097      * glibc < 2.23 can perform an incorrect FMA on certain hosts; see
8098      *   https://sourceware.org/bugzilla/show_bug.cgi?id=13304
8099      */
8100     ua.s = 0x0020000000000001ULL;
8101     ub.s = 0x3ca0000000000000ULL;
8102     uc.s = 0x0020000000000000ULL;
8103     ur.h = fma(ua.h, ub.h, uc.h);
8104     if (ur.s != 0x0020000000000001ULL) {
8105         force_soft_fma = true;
8106     }
8107 }
8108