xref: /openbmc/qemu/fpu/softfloat.c (revision f3635813)
1 /*
2  * QEMU float support
3  *
4  * The code in this source file is derived from release 2a of the SoftFloat
5  * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6  * some later contributions) are provided under that license, as detailed below.
7  * It has subsequently been modified by contributors to the QEMU Project,
8  * so some portions are provided under:
9  *  the SoftFloat-2a license
10  *  the BSD license
11  *  GPL-v2-or-later
12  *
13  * Any future contributions to this file after December 1st 2014 will be
14  * taken to be licensed under the Softfloat-2a license unless specifically
15  * indicated otherwise.
16  */
17 
18 /*
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
22 
23 Written by John R. Hauser.  This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704.  Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980.  The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
32 
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38 
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
43 
44 ===============================================================================
45 */
46 
47 /* BSD licensing:
48  * Copyright (c) 2006, Fabrice Bellard
49  * All rights reserved.
50  *
51  * Redistribution and use in source and binary forms, with or without
52  * modification, are permitted provided that the following conditions are met:
53  *
54  * 1. Redistributions of source code must retain the above copyright notice,
55  * this list of conditions and the following disclaimer.
56  *
57  * 2. Redistributions in binary form must reproduce the above copyright notice,
58  * this list of conditions and the following disclaimer in the documentation
59  * and/or other materials provided with the distribution.
60  *
61  * 3. Neither the name of the copyright holder nor the names of its contributors
62  * may be used to endorse or promote products derived from this software without
63  * specific prior written permission.
64  *
65  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75  * THE POSSIBILITY OF SUCH DAMAGE.
76  */
77 
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79  * version 2 or later. See the COPYING file in the top-level directory.
80  */
81 
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83  * target-dependent and needs the TARGET_* macros.
84  */
85 #include "qemu/osdep.h"
86 #include <math.h>
87 #include "qemu/bitops.h"
88 #include "fpu/softfloat.h"
89 
90 /* We only need stdlib for abort() */
91 
92 /*----------------------------------------------------------------------------
93 | Primitive arithmetic functions, including multi-word arithmetic, and
94 | division and square root approximations.  (Can be specialized to target if
95 | desired.)
96 *----------------------------------------------------------------------------*/
97 #include "fpu/softfloat-macros.h"
98 
99 /*
100  * Hardfloat
101  *
102  * Fast emulation of guest FP instructions is challenging for two reasons.
103  * First, FP instruction semantics are similar but not identical, particularly
104  * when handling NaNs. Second, emulating at reasonable speed the guest FP
105  * exception flags is not trivial: reading the host's flags register with a
106  * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp],
 * and trapping on every FP exception is neither fast nor pleasant to work with.
108  *
109  * We address these challenges by leveraging the host FPU for a subset of the
110  * operations. To do this we expand on the idea presented in this paper:
111  *
112  * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a
113  * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615.
114  *
115  * The idea is thus to leverage the host FPU to (1) compute FP operations
116  * and (2) identify whether FP exceptions occurred while avoiding
117  * expensive exception flag register accesses.
118  *
119  * An important optimization shown in the paper is that given that exception
120  * flags are rarely cleared by the guest, we can avoid recomputing some flags.
121  * This is particularly useful for the inexact flag, which is very frequently
122  * raised in floating-point workloads.
123  *
124  * We optimize the code further by deferring to soft-fp whenever FP exception
125  * detection might get hairy. Two examples: (1) when at least one operand is
126  * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result
127  * and the result is < the minimum normal.
128  */
129 #define GEN_INPUT_FLUSH__NOCHECK(name, soft_t)                          \
130     static inline void name(soft_t *a, float_status *s)                 \
131     {                                                                   \
132         if (unlikely(soft_t ## _is_denormal(*a))) {                     \
133             *a = soft_t ## _set_sign(soft_t ## _zero,                   \
134                                      soft_t ## _is_neg(*a));            \
135             s->float_exception_flags |= float_flag_input_denormal;      \
136         }                                                               \
137     }
138 
139 GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck, float32)
140 GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck, float64)
141 #undef GEN_INPUT_FLUSH__NOCHECK
142 
143 #define GEN_INPUT_FLUSH1(name, soft_t)                  \
144     static inline void name(soft_t *a, float_status *s) \
145     {                                                   \
146         if (likely(!s->flush_inputs_to_zero)) {         \
147             return;                                     \
148         }                                               \
149         soft_t ## _input_flush__nocheck(a, s);          \
150     }
151 
152 GEN_INPUT_FLUSH1(float32_input_flush1, float32)
153 GEN_INPUT_FLUSH1(float64_input_flush1, float64)
154 #undef GEN_INPUT_FLUSH1
155 
156 #define GEN_INPUT_FLUSH2(name, soft_t)                                  \
157     static inline void name(soft_t *a, soft_t *b, float_status *s)      \
158     {                                                                   \
159         if (likely(!s->flush_inputs_to_zero)) {                         \
160             return;                                                     \
161         }                                                               \
162         soft_t ## _input_flush__nocheck(a, s);                          \
163         soft_t ## _input_flush__nocheck(b, s);                          \
164     }
165 
166 GEN_INPUT_FLUSH2(float32_input_flush2, float32)
167 GEN_INPUT_FLUSH2(float64_input_flush2, float64)
168 #undef GEN_INPUT_FLUSH2
169 
170 #define GEN_INPUT_FLUSH3(name, soft_t)                                  \
171     static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \
172     {                                                                   \
173         if (likely(!s->flush_inputs_to_zero)) {                         \
174             return;                                                     \
175         }                                                               \
176         soft_t ## _input_flush__nocheck(a, s);                          \
177         soft_t ## _input_flush__nocheck(b, s);                          \
178         soft_t ## _input_flush__nocheck(c, s);                          \
179     }
180 
181 GEN_INPUT_FLUSH3(float32_input_flush3, float32)
182 GEN_INPUT_FLUSH3(float64_input_flush3, float64)
183 #undef GEN_INPUT_FLUSH3
184 
/*
 * Select, for each combination of operand count (1/2/3) and float size
 * (F32/F64), whether the generated hardfloat functions classify their inputs
 * with fpclassify() (value 1) or with the float32/64_* primitives (value 0).
 */

/* Single precision: the softfloat primitives are used on every host. */
#define QEMU_HARDFLOAT_1F32_USE_FP 0
#define QEMU_HARDFLOAT_2F32_USE_FP 0
#define QEMU_HARDFLOAT_3F32_USE_FP 0

/* Double precision: fpclassify() is used on x86_64 hosts only. */
#if defined(__x86_64__)
# define QEMU_HARDFLOAT_1F64_USE_FP 1
# define QEMU_HARDFLOAT_2F64_USE_FP 1
# define QEMU_HARDFLOAT_3F64_USE_FP 1
#else
# define QEMU_HARDFLOAT_1F64_USE_FP 0
# define QEMU_HARDFLOAT_2F64_USE_FP 0
# define QEMU_HARDFLOAT_3F64_USE_FP 0
#endif
205 
/*
 * QEMU_HARDFLOAT_USE_ISINF selects isinf() (1) or float{32,64}_is_infinity
 * (0) for the infinity checks made when !USE_FP.
 * isinf() gives a ~6% speedup on x86_64/aarch64, but on power64 it reduces
 * fp-bench performance by up to 50%, so it is enabled on the former two only.
 */
#if !defined(__x86_64__) && !defined(__aarch64__)
# define QEMU_HARDFLOAT_USE_ISINF   0
#else
# define QEMU_HARDFLOAT_USE_ISINF   1
#endif
217 
/*
 * Hardfloat relies on the inexact flag being already set, so it cannot be
 * used on targets that clear the FP flags before most FP operations.  It is
 * also disabled (with a warning) when building with -ffast-math.
 *
 * QEMU_SOFTFLOAT_ATTR additionally carries noinline when hardfloat is enabled.
 */
#if !defined(TARGET_PPC) && !defined(__FAST_MATH__)
# define QEMU_NO_HARDFLOAT 0
# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline))
#else
# if defined(__FAST_MATH__)
#  warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \
    IEEE implementation
# endif
# define QEMU_NO_HARDFLOAT 1
# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN
#endif
234 
235 static inline bool can_use_fpu(const float_status *s)
236 {
237     if (QEMU_NO_HARDFLOAT) {
238         return false;
239     }
240     return likely(s->float_exception_flags & float_flag_inexact &&
241                   s->float_rounding_mode == float_round_nearest_even);
242 }
243 
244 /*
245  * Hardfloat generation functions. Each operation can have two flavors:
246  * either using softfloat primitives (e.g. float32_is_zero_or_normal) for
247  * most condition checks, or native ones (e.g. fpclassify).
248  *
249  * The flavor is chosen by the callers. Instead of using macros, we rely on the
250  * compiler to propagate constants and inline everything into the callers.
251  *
252  * We only generate functions for operations with two inputs, since only
253  * these are common enough to justify consolidating them into common code.
254  */
255 
256 typedef union {
257     float32 s;
258     float h;
259 } union_float32;
260 
261 typedef union {
262     float64 s;
263     double h;
264 } union_float64;
265 
266 typedef bool (*f32_check_fn)(union_float32 a, union_float32 b);
267 typedef bool (*f64_check_fn)(union_float64 a, union_float64 b);
268 
269 typedef float32 (*soft_f32_op2_fn)(float32 a, float32 b, float_status *s);
270 typedef float64 (*soft_f64_op2_fn)(float64 a, float64 b, float_status *s);
271 typedef float   (*hard_f32_op2_fn)(float a, float b);
272 typedef double  (*hard_f64_op2_fn)(double a, double b);
273 
274 /* 2-input is-zero-or-normal */
275 static inline bool f32_is_zon2(union_float32 a, union_float32 b)
276 {
277     if (QEMU_HARDFLOAT_2F32_USE_FP) {
278         /*
279          * Not using a temp variable for consecutive fpclassify calls ends up
280          * generating faster code.
281          */
282         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
283                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
284     }
285     return float32_is_zero_or_normal(a.s) &&
286            float32_is_zero_or_normal(b.s);
287 }
288 
289 static inline bool f64_is_zon2(union_float64 a, union_float64 b)
290 {
291     if (QEMU_HARDFLOAT_2F64_USE_FP) {
292         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
293                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
294     }
295     return float64_is_zero_or_normal(a.s) &&
296            float64_is_zero_or_normal(b.s);
297 }
298 
299 /* 3-input is-zero-or-normal */
300 static inline
301 bool f32_is_zon3(union_float32 a, union_float32 b, union_float32 c)
302 {
303     if (QEMU_HARDFLOAT_3F32_USE_FP) {
304         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
305                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
306                (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
307     }
308     return float32_is_zero_or_normal(a.s) &&
309            float32_is_zero_or_normal(b.s) &&
310            float32_is_zero_or_normal(c.s);
311 }
312 
313 static inline
314 bool f64_is_zon3(union_float64 a, union_float64 b, union_float64 c)
315 {
316     if (QEMU_HARDFLOAT_3F64_USE_FP) {
317         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
318                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
319                (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
320     }
321     return float64_is_zero_or_normal(a.s) &&
322            float64_is_zero_or_normal(b.s) &&
323            float64_is_zero_or_normal(c.s);
324 }
325 
326 static inline bool f32_is_inf(union_float32 a)
327 {
328     if (QEMU_HARDFLOAT_USE_ISINF) {
329         return isinf(a.h);
330     }
331     return float32_is_infinity(a.s);
332 }
333 
334 static inline bool f64_is_inf(union_float64 a)
335 {
336     if (QEMU_HARDFLOAT_USE_ISINF) {
337         return isinf(a.h);
338     }
339     return float64_is_infinity(a.s);
340 }
341 
342 /* Note: @fast_test and @post can be NULL */
343 static inline float32
344 float32_gen2(float32 xa, float32 xb, float_status *s,
345              hard_f32_op2_fn hard, soft_f32_op2_fn soft,
346              f32_check_fn pre, f32_check_fn post,
347              f32_check_fn fast_test, soft_f32_op2_fn fast_op)
348 {
349     union_float32 ua, ub, ur;
350 
351     ua.s = xa;
352     ub.s = xb;
353 
354     if (unlikely(!can_use_fpu(s))) {
355         goto soft;
356     }
357 
358     float32_input_flush2(&ua.s, &ub.s, s);
359     if (unlikely(!pre(ua, ub))) {
360         goto soft;
361     }
362     if (fast_test && fast_test(ua, ub)) {
363         return fast_op(ua.s, ub.s, s);
364     }
365 
366     ur.h = hard(ua.h, ub.h);
367     if (unlikely(f32_is_inf(ur))) {
368         s->float_exception_flags |= float_flag_overflow;
369     } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) {
370         if (post == NULL || post(ua, ub)) {
371             goto soft;
372         }
373     }
374     return ur.s;
375 
376  soft:
377     return soft(ua.s, ub.s, s);
378 }
379 
380 static inline float64
381 float64_gen2(float64 xa, float64 xb, float_status *s,
382              hard_f64_op2_fn hard, soft_f64_op2_fn soft,
383              f64_check_fn pre, f64_check_fn post,
384              f64_check_fn fast_test, soft_f64_op2_fn fast_op)
385 {
386     union_float64 ua, ub, ur;
387 
388     ua.s = xa;
389     ub.s = xb;
390 
391     if (unlikely(!can_use_fpu(s))) {
392         goto soft;
393     }
394 
395     float64_input_flush2(&ua.s, &ub.s, s);
396     if (unlikely(!pre(ua, ub))) {
397         goto soft;
398     }
399     if (fast_test && fast_test(ua, ub)) {
400         return fast_op(ua.s, ub.s, s);
401     }
402 
403     ur.h = hard(ua.h, ub.h);
404     if (unlikely(f64_is_inf(ur))) {
405         s->float_exception_flags |= float_flag_overflow;
406     } else if (unlikely(fabs(ur.h) <= DBL_MIN)) {
407         if (post == NULL || post(ua, ub)) {
408             goto soft;
409         }
410     }
411     return ur.s;
412 
413  soft:
414     return soft(ua.s, ub.s, s);
415 }
416 
417 /*----------------------------------------------------------------------------
418 | Returns the fraction bits of the single-precision floating-point value `a'.
419 *----------------------------------------------------------------------------*/
420 
421 static inline uint32_t extractFloat32Frac(float32 a)
422 {
423     return float32_val(a) & 0x007FFFFF;
424 }
425 
426 /*----------------------------------------------------------------------------
427 | Returns the exponent bits of the single-precision floating-point value `a'.
428 *----------------------------------------------------------------------------*/
429 
430 static inline int extractFloat32Exp(float32 a)
431 {
432     return (float32_val(a) >> 23) & 0xFF;
433 }
434 
435 /*----------------------------------------------------------------------------
436 | Returns the sign bit of the single-precision floating-point value `a'.
437 *----------------------------------------------------------------------------*/
438 
439 static inline flag extractFloat32Sign(float32 a)
440 {
441     return float32_val(a) >> 31;
442 }
443 
444 /*----------------------------------------------------------------------------
445 | Returns the fraction bits of the double-precision floating-point value `a'.
446 *----------------------------------------------------------------------------*/
447 
448 static inline uint64_t extractFloat64Frac(float64 a)
449 {
450     return float64_val(a) & UINT64_C(0x000FFFFFFFFFFFFF);
451 }
452 
453 /*----------------------------------------------------------------------------
454 | Returns the exponent bits of the double-precision floating-point value `a'.
455 *----------------------------------------------------------------------------*/
456 
457 static inline int extractFloat64Exp(float64 a)
458 {
459     return (float64_val(a) >> 52) & 0x7FF;
460 }
461 
462 /*----------------------------------------------------------------------------
463 | Returns the sign bit of the double-precision floating-point value `a'.
464 *----------------------------------------------------------------------------*/
465 
466 static inline flag extractFloat64Sign(float64 a)
467 {
468     return float64_val(a) >> 63;
469 }
470 
/*
 * Classify a floating point number. The enumerators are ordered so that
 * everything from float_class_qnan onward is a NaN, i.e.
 * cls >= float_class_qnan is true for any NaN.
 */

typedef enum __attribute__ ((__packed__)) {
    float_class_unclassified = 0,
    float_class_zero         = 1,
    float_class_normal       = 2,
    float_class_inf          = 3,
    float_class_qnan         = 4,  /* all NaNs from here */
    float_class_snan         = 5,
} FloatClass;
484 
485 /* Simple helpers for checking if, or what kind of, NaN we have */
486 static inline __attribute__((unused)) bool is_nan(FloatClass c)
487 {
488     return unlikely(c >= float_class_qnan);
489 }
490 
491 static inline __attribute__((unused)) bool is_snan(FloatClass c)
492 {
493     return c == float_class_snan;
494 }
495 
496 static inline __attribute__((unused)) bool is_qnan(FloatClass c)
497 {
498     return c == float_class_qnan;
499 }
500 
/*
 * Structure holding all of the decomposed parts of a float. The
 * exponent is unbiased and the fraction is normalized. All
 * calculations are done with a 64 bit fraction and then rounded as
 * appropriate for the final format.
 *
 * Thanks to the packed FloatClass a decent compiler should be able to
 * fit the whole structure into registers and avoid using the stack
 * for parameter passing.
 *
 * Note that unpack_raw() fills this structure with the raw, still biased
 * and unnormalized fields and cls set to float_class_unclassified; the
 * description above applies once the parts have been canonicalized.
 */

typedef struct {
    uint64_t frac;      /* fraction bits */
    int32_t  exp;       /* exponent */
    FloatClass cls;     /* classification (zero/normal/inf/NaN kind) */
    bool sign;          /* sign bit: true when set */
} FloatParts;
518 
/*
 * Position of the binary point within the 64-bit decomposed fraction, the
 * implicit (integer) bit that sits there, and the bit above it, which is
 * used to detect a carry out of the fraction.
 */
#define DECOMPOSED_BINARY_POINT    (64 - 2)
#define DECOMPOSED_IMPLICIT_BIT    (1ull << DECOMPOSED_BINARY_POINT)
#define DECOMPOSED_OVERFLOW_BIT    (DECOMPOSED_IMPLICIT_BIT << 1)

/* Structure holding all of the relevant parameters for a format.
 *   exp_size: the size of the exponent field
 *   exp_bias: the offset applied to the exponent field
 *   exp_max: the maximum normalised exponent
 *   frac_size: the size of the fraction field
 *   frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
 * The following are computed based on the size of the fraction
 *   frac_lsb: least significant bit of fraction
 *   frac_lsbm1: the bit below the least significant bit (for rounding)
 *   round_mask/roundeven_mask: masks used for rounding
 * The following optional modifiers are available:
 *   arm_althp: handle ARM Alternative Half Precision
 */
typedef struct {
    int exp_size;
    int exp_bias;
    int exp_max;
    int frac_size;
    int frac_shift;
    uint64_t frac_lsb;
    uint64_t frac_lsbm1;
    uint64_t round_mask;
    uint64_t roundeven_mask;
    bool arm_althp;
} FloatFmt;

/*
 * Expand fields based on the size of exponent (E) and fraction (F).
 * Both arguments are parenthesized at every use so that expression
 * arguments (e.g. "20 + 3") expand with the intended precedence.
 */
#define FLOAT_PARAMS(E, F)                                           \
    .exp_size       = (E),                                           \
    .exp_bias       = ((1 << (E)) - 1) >> 1,                         \
    .exp_max        = (1 << (E)) - 1,                                \
    .frac_size      = (F),                                           \
    .frac_shift     = DECOMPOSED_BINARY_POINT - (F),                 \
    .frac_lsb       = 1ull << (DECOMPOSED_BINARY_POINT - (F)),       \
    .frac_lsbm1     = 1ull << ((DECOMPOSED_BINARY_POINT - (F)) - 1), \
    .round_mask     = (1ull << (DECOMPOSED_BINARY_POINT - (F))) - 1, \
    .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - (F))) - 1

/* Half precision: 5 exponent bits, 10 fraction bits. */
static const FloatFmt float16_params = {
    FLOAT_PARAMS(5, 10)
};

/* Same field sizes as float16_params, with arm_althp set. */
static const FloatFmt float16_params_ahp = {
    FLOAT_PARAMS(5, 10),
    .arm_althp = true
};

/* Single precision: 8 exponent bits, 23 fraction bits. */
static const FloatFmt float32_params = {
    FLOAT_PARAMS(8, 23)
};

/* Double precision: 11 exponent bits, 52 fraction bits. */
static const FloatFmt float64_params = {
    FLOAT_PARAMS(11, 52)
};
577 
578 /* Unpack a float to parts, but do not canonicalize.  */
579 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw)
580 {
581     const int sign_pos = fmt.frac_size + fmt.exp_size;
582 
583     return (FloatParts) {
584         .cls = float_class_unclassified,
585         .sign = extract64(raw, sign_pos, 1),
586         .exp = extract64(raw, fmt.frac_size, fmt.exp_size),
587         .frac = extract64(raw, 0, fmt.frac_size),
588     };
589 }
590 
591 static inline FloatParts float16_unpack_raw(float16 f)
592 {
593     return unpack_raw(float16_params, f);
594 }
595 
596 static inline FloatParts float32_unpack_raw(float32 f)
597 {
598     return unpack_raw(float32_params, f);
599 }
600 
601 static inline FloatParts float64_unpack_raw(float64 f)
602 {
603     return unpack_raw(float64_params, f);
604 }
605 
606 /* Pack a float from parts, but do not canonicalize.  */
607 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p)
608 {
609     const int sign_pos = fmt.frac_size + fmt.exp_size;
610     uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp);
611     return deposit64(ret, sign_pos, 1, p.sign);
612 }
613 
614 static inline float16 float16_pack_raw(FloatParts p)
615 {
616     return make_float16(pack_raw(float16_params, p));
617 }
618 
619 static inline float32 float32_pack_raw(FloatParts p)
620 {
621     return make_float32(pack_raw(float32_params, p));
622 }
623 
624 static inline float64 float64_pack_raw(FloatParts p)
625 {
626     return make_float64(pack_raw(float64_params, p));
627 }
628 
629 /*----------------------------------------------------------------------------
630 | Functions and definitions to determine:  (1) whether tininess for underflow
631 | is detected before or after rounding by default, (2) what (if anything)
632 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
633 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
634 | are propagated from function inputs to output.  These details are target-
635 | specific.
636 *----------------------------------------------------------------------------*/
637 #include "softfloat-specialize.inc.c"
638 
639 /* Canonicalize EXP and FRAC, setting CLS.  */
640 static FloatParts sf_canonicalize(FloatParts part, const FloatFmt *parm,
641                                   float_status *status)
642 {
643     if (part.exp == parm->exp_max && !parm->arm_althp) {
644         if (part.frac == 0) {
645             part.cls = float_class_inf;
646         } else {
647             part.frac <<= parm->frac_shift;
648             part.cls = (parts_is_snan_frac(part.frac, status)
649                         ? float_class_snan : float_class_qnan);
650         }
651     } else if (part.exp == 0) {
652         if (likely(part.frac == 0)) {
653             part.cls = float_class_zero;
654         } else if (status->flush_inputs_to_zero) {
655             float_raise(float_flag_input_denormal, status);
656             part.cls = float_class_zero;
657             part.frac = 0;
658         } else {
659             int shift = clz64(part.frac) - 1;
660             part.cls = float_class_normal;
661             part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
662             part.frac <<= shift;
663         }
664     } else {
665         part.cls = float_class_normal;
666         part.exp -= parm->exp_bias;
667         part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
668     }
669     return part;
670 }
671 
672 /* Round and uncanonicalize a floating-point number by parts. There
673  * are FRAC_SHIFT bits that may require rounding at the bottom of the
674  * fraction; these bits will be removed. The exponent will be biased
675  * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
676  */
677 
678 static FloatParts round_canonical(FloatParts p, float_status *s,
679                                   const FloatFmt *parm)
680 {
681     const uint64_t frac_lsb = parm->frac_lsb;
682     const uint64_t frac_lsbm1 = parm->frac_lsbm1;
683     const uint64_t round_mask = parm->round_mask;
684     const uint64_t roundeven_mask = parm->roundeven_mask;
685     const int exp_max = parm->exp_max;
686     const int frac_shift = parm->frac_shift;
687     uint64_t frac, inc;
688     int exp, flags = 0;
689     bool overflow_norm;
690 
691     frac = p.frac;
692     exp = p.exp;
693 
694     switch (p.cls) {
695     case float_class_normal:
696         switch (s->float_rounding_mode) {
697         case float_round_nearest_even:
698             overflow_norm = false;
699             inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
700             break;
701         case float_round_ties_away:
702             overflow_norm = false;
703             inc = frac_lsbm1;
704             break;
705         case float_round_to_zero:
706             overflow_norm = true;
707             inc = 0;
708             break;
709         case float_round_up:
710             inc = p.sign ? 0 : round_mask;
711             overflow_norm = p.sign;
712             break;
713         case float_round_down:
714             inc = p.sign ? round_mask : 0;
715             overflow_norm = !p.sign;
716             break;
717         case float_round_to_odd:
718             overflow_norm = true;
719             inc = frac & frac_lsb ? 0 : round_mask;
720             break;
721         default:
722             g_assert_not_reached();
723         }
724 
725         exp += parm->exp_bias;
726         if (likely(exp > 0)) {
727             if (frac & round_mask) {
728                 flags |= float_flag_inexact;
729                 frac += inc;
730                 if (frac & DECOMPOSED_OVERFLOW_BIT) {
731                     frac >>= 1;
732                     exp++;
733                 }
734             }
735             frac >>= frac_shift;
736 
737             if (parm->arm_althp) {
738                 /* ARM Alt HP eschews Inf and NaN for a wider exponent.  */
739                 if (unlikely(exp > exp_max)) {
740                     /* Overflow.  Return the maximum normal.  */
741                     flags = float_flag_invalid;
742                     exp = exp_max;
743                     frac = -1;
744                 }
745             } else if (unlikely(exp >= exp_max)) {
746                 flags |= float_flag_overflow | float_flag_inexact;
747                 if (overflow_norm) {
748                     exp = exp_max - 1;
749                     frac = -1;
750                 } else {
751                     p.cls = float_class_inf;
752                     goto do_inf;
753                 }
754             }
755         } else if (s->flush_to_zero) {
756             flags |= float_flag_output_denormal;
757             p.cls = float_class_zero;
758             goto do_zero;
759         } else {
760             bool is_tiny = (s->float_detect_tininess
761                             == float_tininess_before_rounding)
762                         || (exp < 0)
763                         || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT);
764 
765             shift64RightJamming(frac, 1 - exp, &frac);
766             if (frac & round_mask) {
767                 /* Need to recompute round-to-even.  */
768                 switch (s->float_rounding_mode) {
769                 case float_round_nearest_even:
770                     inc = ((frac & roundeven_mask) != frac_lsbm1
771                            ? frac_lsbm1 : 0);
772                     break;
773                 case float_round_to_odd:
774                     inc = frac & frac_lsb ? 0 : round_mask;
775                     break;
776                 }
777                 flags |= float_flag_inexact;
778                 frac += inc;
779             }
780 
781             exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
782             frac >>= frac_shift;
783 
784             if (is_tiny && (flags & float_flag_inexact)) {
785                 flags |= float_flag_underflow;
786             }
787             if (exp == 0 && frac == 0) {
788                 p.cls = float_class_zero;
789             }
790         }
791         break;
792 
793     case float_class_zero:
794     do_zero:
795         exp = 0;
796         frac = 0;
797         break;
798 
799     case float_class_inf:
800     do_inf:
801         assert(!parm->arm_althp);
802         exp = exp_max;
803         frac = 0;
804         break;
805 
806     case float_class_qnan:
807     case float_class_snan:
808         assert(!parm->arm_althp);
809         exp = exp_max;
810         frac >>= parm->frac_shift;
811         break;
812 
813     default:
814         g_assert_not_reached();
815     }
816 
817     float_raise(flags, s);
818     p.exp = exp;
819     p.frac = frac;
820     return p;
821 }
822 
823 /* Explicit FloatFmt version */
824 static FloatParts float16a_unpack_canonical(float16 f, float_status *s,
825                                             const FloatFmt *params)
826 {
827     return sf_canonicalize(float16_unpack_raw(f), params, s);
828 }
829 
830 static FloatParts float16_unpack_canonical(float16 f, float_status *s)
831 {
832     return float16a_unpack_canonical(f, s, &float16_params);
833 }
834 
835 static float16 float16a_round_pack_canonical(FloatParts p, float_status *s,
836                                              const FloatFmt *params)
837 {
838     return float16_pack_raw(round_canonical(p, s, params));
839 }
840 
841 static float16 float16_round_pack_canonical(FloatParts p, float_status *s)
842 {
843     return float16a_round_pack_canonical(p, s, &float16_params);
844 }
845 
846 static FloatParts float32_unpack_canonical(float32 f, float_status *s)
847 {
848     return sf_canonicalize(float32_unpack_raw(f), &float32_params, s);
849 }
850 
851 static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
852 {
853     return float32_pack_raw(round_canonical(p, s, &float32_params));
854 }
855 
856 static FloatParts float64_unpack_canonical(float64 f, float_status *s)
857 {
858     return sf_canonicalize(float64_unpack_raw(f), &float64_params, s);
859 }
860 
861 static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
862 {
863     return float64_pack_raw(round_canonical(p, s, &float64_params));
864 }
865 
866 static FloatParts return_nan(FloatParts a, float_status *s)
867 {
868     switch (a.cls) {
869     case float_class_snan:
870         s->float_exception_flags |= float_flag_invalid;
871         a = parts_silence_nan(a, s);
872         /* fall through */
873     case float_class_qnan:
874         if (s->default_nan_mode) {
875             return parts_default_nan(s);
876         }
877         break;
878 
879     default:
880         g_assert_not_reached();
881     }
882     return a;
883 }
884 
885 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s)
886 {
887     if (is_snan(a.cls) || is_snan(b.cls)) {
888         s->float_exception_flags |= float_flag_invalid;
889     }
890 
891     if (s->default_nan_mode) {
892         return parts_default_nan(s);
893     } else {
894         if (pickNaN(a.cls, b.cls,
895                     a.frac > b.frac ||
896                     (a.frac == b.frac && a.sign < b.sign))) {
897             a = b;
898         }
899         if (is_snan(a.cls)) {
900             return parts_silence_nan(a, s);
901         }
902     }
903     return a;
904 }
905 
906 static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c,
907                                   bool inf_zero, float_status *s)
908 {
909     int which;
910 
911     if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
912         s->float_exception_flags |= float_flag_invalid;
913     }
914 
915     which = pickNaNMulAdd(a.cls, b.cls, c.cls, inf_zero, s);
916 
917     if (s->default_nan_mode) {
918         /* Note that this check is after pickNaNMulAdd so that function
919          * has an opportunity to set the Invalid flag.
920          */
921         which = 3;
922     }
923 
924     switch (which) {
925     case 0:
926         break;
927     case 1:
928         a = b;
929         break;
930     case 2:
931         a = c;
932         break;
933     case 3:
934         return parts_default_nan(s);
935     default:
936         g_assert_not_reached();
937     }
938 
939     if (is_snan(a.cls)) {
940         return parts_silence_nan(a, s);
941     }
942     return a;
943 }
944 
945 /*
946  * Returns the result of adding or subtracting the values of the
947  * floating-point values `a' and `b'. The operation is performed
948  * according to the IEC/IEEE Standard for Binary Floating-Point
949  * Arithmetic.
950  */
951 
952 static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
953                                 float_status *s)
954 {
955     bool a_sign = a.sign;
956     bool b_sign = b.sign ^ subtract;
957 
958     if (a_sign != b_sign) {
959         /* Subtraction */
960 
961         if (a.cls == float_class_normal && b.cls == float_class_normal) {
962             if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
963                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
964                 a.frac = a.frac - b.frac;
965             } else {
966                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
967                 a.frac = b.frac - a.frac;
968                 a.exp = b.exp;
969                 a_sign ^= 1;
970             }
971 
972             if (a.frac == 0) {
973                 a.cls = float_class_zero;
974                 a.sign = s->float_rounding_mode == float_round_down;
975             } else {
976                 int shift = clz64(a.frac) - 1;
977                 a.frac = a.frac << shift;
978                 a.exp = a.exp - shift;
979                 a.sign = a_sign;
980             }
981             return a;
982         }
983         if (is_nan(a.cls) || is_nan(b.cls)) {
984             return pick_nan(a, b, s);
985         }
986         if (a.cls == float_class_inf) {
987             if (b.cls == float_class_inf) {
988                 float_raise(float_flag_invalid, s);
989                 return parts_default_nan(s);
990             }
991             return a;
992         }
993         if (a.cls == float_class_zero && b.cls == float_class_zero) {
994             a.sign = s->float_rounding_mode == float_round_down;
995             return a;
996         }
997         if (a.cls == float_class_zero || b.cls == float_class_inf) {
998             b.sign = a_sign ^ 1;
999             return b;
1000         }
1001         if (b.cls == float_class_zero) {
1002             return a;
1003         }
1004     } else {
1005         /* Addition */
1006         if (a.cls == float_class_normal && b.cls == float_class_normal) {
1007             if (a.exp > b.exp) {
1008                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
1009             } else if (a.exp < b.exp) {
1010                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
1011                 a.exp = b.exp;
1012             }
1013             a.frac += b.frac;
1014             if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
1015                 shift64RightJamming(a.frac, 1, &a.frac);
1016                 a.exp += 1;
1017             }
1018             return a;
1019         }
1020         if (is_nan(a.cls) || is_nan(b.cls)) {
1021             return pick_nan(a, b, s);
1022         }
1023         if (a.cls == float_class_inf || b.cls == float_class_zero) {
1024             return a;
1025         }
1026         if (b.cls == float_class_inf || a.cls == float_class_zero) {
1027             b.sign = b_sign;
1028             return b;
1029         }
1030     }
1031     g_assert_not_reached();
1032 }
1033 
1034 /*
1035  * Returns the result of adding or subtracting the floating-point
1036  * values `a' and `b'. The operation is performed according to the
1037  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1038  */
1039 
1040 float16 QEMU_FLATTEN float16_add(float16 a, float16 b, float_status *status)
1041 {
1042     FloatParts pa = float16_unpack_canonical(a, status);
1043     FloatParts pb = float16_unpack_canonical(b, status);
1044     FloatParts pr = addsub_floats(pa, pb, false, status);
1045 
1046     return float16_round_pack_canonical(pr, status);
1047 }
1048 
1049 float16 QEMU_FLATTEN float16_sub(float16 a, float16 b, float_status *status)
1050 {
1051     FloatParts pa = float16_unpack_canonical(a, status);
1052     FloatParts pb = float16_unpack_canonical(b, status);
1053     FloatParts pr = addsub_floats(pa, pb, true, status);
1054 
1055     return float16_round_pack_canonical(pr, status);
1056 }
1057 
1058 static float32 QEMU_SOFTFLOAT_ATTR
1059 soft_f32_addsub(float32 a, float32 b, bool subtract, float_status *status)
1060 {
1061     FloatParts pa = float32_unpack_canonical(a, status);
1062     FloatParts pb = float32_unpack_canonical(b, status);
1063     FloatParts pr = addsub_floats(pa, pb, subtract, status);
1064 
1065     return float32_round_pack_canonical(pr, status);
1066 }
1067 
1068 static inline float32 soft_f32_add(float32 a, float32 b, float_status *status)
1069 {
1070     return soft_f32_addsub(a, b, false, status);
1071 }
1072 
1073 static inline float32 soft_f32_sub(float32 a, float32 b, float_status *status)
1074 {
1075     return soft_f32_addsub(a, b, true, status);
1076 }
1077 
1078 static float64 QEMU_SOFTFLOAT_ATTR
1079 soft_f64_addsub(float64 a, float64 b, bool subtract, float_status *status)
1080 {
1081     FloatParts pa = float64_unpack_canonical(a, status);
1082     FloatParts pb = float64_unpack_canonical(b, status);
1083     FloatParts pr = addsub_floats(pa, pb, subtract, status);
1084 
1085     return float64_round_pack_canonical(pr, status);
1086 }
1087 
1088 static inline float64 soft_f64_add(float64 a, float64 b, float_status *status)
1089 {
1090     return soft_f64_addsub(a, b, false, status);
1091 }
1092 
1093 static inline float64 soft_f64_sub(float64 a, float64 b, float_status *status)
1094 {
1095     return soft_f64_addsub(a, b, true, status);
1096 }
1097 
/* Host-FPU single-precision addition; the `hard' callback of float32_add(). */
static float hard_f32_add(float a, float b)
{
    const float sum = a + b;

    return sum;
}
1102 
/* Host-FPU single-precision subtraction; the `hard' callback of float32_sub(). */
static float hard_f32_sub(float a, float b)
{
    const float diff = a - b;

    return diff;
}
1107 
/* Host-FPU double-precision addition; the `hard' callback of float64_add(). */
static double hard_f64_add(double a, double b)
{
    const double sum = a + b;

    return sum;
}
1112 
/* Host-FPU double-precision subtraction; the `hard' callback of float64_sub(). */
static double hard_f64_sub(double a, double b)
{
    const double diff = a - b;

    return diff;
}
1117 
1118 static bool f32_addsub_post(union_float32 a, union_float32 b)
1119 {
1120     if (QEMU_HARDFLOAT_2F32_USE_FP) {
1121         return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
1122     }
1123     return !(float32_is_zero(a.s) && float32_is_zero(b.s));
1124 }
1125 
1126 static bool f64_addsub_post(union_float64 a, union_float64 b)
1127 {
1128     if (QEMU_HARDFLOAT_2F64_USE_FP) {
1129         return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
1130     } else {
1131         return !(float64_is_zero(a.s) && float64_is_zero(b.s));
1132     }
1133 }
1134 
1135 static float32 float32_addsub(float32 a, float32 b, float_status *s,
1136                               hard_f32_op2_fn hard, soft_f32_op2_fn soft)
1137 {
1138     return float32_gen2(a, b, s, hard, soft,
1139                         f32_is_zon2, f32_addsub_post, NULL, NULL);
1140 }
1141 
1142 static float64 float64_addsub(float64 a, float64 b, float_status *s,
1143                               hard_f64_op2_fn hard, soft_f64_op2_fn soft)
1144 {
1145     return float64_gen2(a, b, s, hard, soft,
1146                         f64_is_zon2, f64_addsub_post, NULL, NULL);
1147 }
1148 
1149 float32 QEMU_FLATTEN
1150 float32_add(float32 a, float32 b, float_status *s)
1151 {
1152     return float32_addsub(a, b, s, hard_f32_add, soft_f32_add);
1153 }
1154 
1155 float32 QEMU_FLATTEN
1156 float32_sub(float32 a, float32 b, float_status *s)
1157 {
1158     return float32_addsub(a, b, s, hard_f32_sub, soft_f32_sub);
1159 }
1160 
1161 float64 QEMU_FLATTEN
1162 float64_add(float64 a, float64 b, float_status *s)
1163 {
1164     return float64_addsub(a, b, s, hard_f64_add, soft_f64_add);
1165 }
1166 
1167 float64 QEMU_FLATTEN
1168 float64_sub(float64 a, float64 b, float_status *s)
1169 {
1170     return float64_addsub(a, b, s, hard_f64_sub, soft_f64_sub);
1171 }
1172 
1173 /*
1174  * Returns the result of multiplying the floating-point values `a' and
1175  * `b'. The operation is performed according to the IEC/IEEE Standard
1176  * for Binary Floating-Point Arithmetic.
1177  */
1178 
1179 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s)
1180 {
1181     bool sign = a.sign ^ b.sign;
1182 
1183     if (a.cls == float_class_normal && b.cls == float_class_normal) {
1184         uint64_t hi, lo;
1185         int exp = a.exp + b.exp;
1186 
1187         mul64To128(a.frac, b.frac, &hi, &lo);
1188         shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1189         if (lo & DECOMPOSED_OVERFLOW_BIT) {
1190             shift64RightJamming(lo, 1, &lo);
1191             exp += 1;
1192         }
1193 
1194         /* Re-use a */
1195         a.exp = exp;
1196         a.sign = sign;
1197         a.frac = lo;
1198         return a;
1199     }
1200     /* handle all the NaN cases */
1201     if (is_nan(a.cls) || is_nan(b.cls)) {
1202         return pick_nan(a, b, s);
1203     }
1204     /* Inf * Zero == NaN */
1205     if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
1206         (a.cls == float_class_zero && b.cls == float_class_inf)) {
1207         s->float_exception_flags |= float_flag_invalid;
1208         return parts_default_nan(s);
1209     }
1210     /* Multiply by 0 or Inf */
1211     if (a.cls == float_class_inf || a.cls == float_class_zero) {
1212         a.sign = sign;
1213         return a;
1214     }
1215     if (b.cls == float_class_inf || b.cls == float_class_zero) {
1216         b.sign = sign;
1217         return b;
1218     }
1219     g_assert_not_reached();
1220 }
1221 
1222 float16 QEMU_FLATTEN float16_mul(float16 a, float16 b, float_status *status)
1223 {
1224     FloatParts pa = float16_unpack_canonical(a, status);
1225     FloatParts pb = float16_unpack_canonical(b, status);
1226     FloatParts pr = mul_floats(pa, pb, status);
1227 
1228     return float16_round_pack_canonical(pr, status);
1229 }
1230 
1231 static float32 QEMU_SOFTFLOAT_ATTR
1232 soft_f32_mul(float32 a, float32 b, float_status *status)
1233 {
1234     FloatParts pa = float32_unpack_canonical(a, status);
1235     FloatParts pb = float32_unpack_canonical(b, status);
1236     FloatParts pr = mul_floats(pa, pb, status);
1237 
1238     return float32_round_pack_canonical(pr, status);
1239 }
1240 
1241 static float64 QEMU_SOFTFLOAT_ATTR
1242 soft_f64_mul(float64 a, float64 b, float_status *status)
1243 {
1244     FloatParts pa = float64_unpack_canonical(a, status);
1245     FloatParts pb = float64_unpack_canonical(b, status);
1246     FloatParts pr = mul_floats(pa, pb, status);
1247 
1248     return float64_round_pack_canonical(pr, status);
1249 }
1250 
/* Host-FPU single-precision multiplication; the `hard' callback of float32_mul(). */
static float hard_f32_mul(float a, float b)
{
    const float product = a * b;

    return product;
}
1255 
/* Host-FPU double-precision multiplication; the `hard' callback of float64_mul(). */
static double hard_f64_mul(double a, double b)
{
    const double product = a * b;

    return product;
}
1260 
1261 static bool f32_mul_fast_test(union_float32 a, union_float32 b)
1262 {
1263     return float32_is_zero(a.s) || float32_is_zero(b.s);
1264 }
1265 
1266 static bool f64_mul_fast_test(union_float64 a, union_float64 b)
1267 {
1268     return float64_is_zero(a.s) || float64_is_zero(b.s);
1269 }
1270 
1271 static float32 f32_mul_fast_op(float32 a, float32 b, float_status *s)
1272 {
1273     bool signbit = float32_is_neg(a) ^ float32_is_neg(b);
1274 
1275     return float32_set_sign(float32_zero, signbit);
1276 }
1277 
1278 static float64 f64_mul_fast_op(float64 a, float64 b, float_status *s)
1279 {
1280     bool signbit = float64_is_neg(a) ^ float64_is_neg(b);
1281 
1282     return float64_set_sign(float64_zero, signbit);
1283 }
1284 
1285 float32 QEMU_FLATTEN
1286 float32_mul(float32 a, float32 b, float_status *s)
1287 {
1288     return float32_gen2(a, b, s, hard_f32_mul, soft_f32_mul,
1289                         f32_is_zon2, NULL, f32_mul_fast_test, f32_mul_fast_op);
1290 }
1291 
1292 float64 QEMU_FLATTEN
1293 float64_mul(float64 a, float64 b, float_status *s)
1294 {
1295     return float64_gen2(a, b, s, hard_f64_mul, soft_f64_mul,
1296                         f64_is_zon2, NULL, f64_mul_fast_test, f64_mul_fast_op);
1297 }
1298 
1299 /*
1300  * Returns the result of multiplying the floating-point values `a' and
1301  * `b' then adding 'c', with no intermediate rounding step after the
1302  * multiplication. The operation is performed according to the
1303  * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
1304  * The flags argument allows the caller to select negation of the
1305  * addend, the intermediate product, or the final result. (The
1306  * difference between this and having the caller do a separate
1307  * negation is that negating externally will flip the sign bit on
1308  * NaNs.)
1309  */
1310 
1311 static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c,
1312                                 int flags, float_status *s)
1313 {
1314     bool inf_zero = ((1 << a.cls) | (1 << b.cls)) ==
1315                     ((1 << float_class_inf) | (1 << float_class_zero));
1316     bool p_sign;
1317     bool sign_flip = flags & float_muladd_negate_result;
1318     FloatClass p_class;
1319     uint64_t hi, lo;
1320     int p_exp;
1321 
1322     /* It is implementation-defined whether the cases of (0,inf,qnan)
1323      * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
1324      * they return if they do), so we have to hand this information
1325      * off to the target-specific pick-a-NaN routine.
1326      */
1327     if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) {
1328         return pick_nan_muladd(a, b, c, inf_zero, s);
1329     }
1330 
1331     if (inf_zero) {
1332         s->float_exception_flags |= float_flag_invalid;
1333         return parts_default_nan(s);
1334     }
1335 
1336     if (flags & float_muladd_negate_c) {
1337         c.sign ^= 1;
1338     }
1339 
1340     p_sign = a.sign ^ b.sign;
1341 
1342     if (flags & float_muladd_negate_product) {
1343         p_sign ^= 1;
1344     }
1345 
1346     if (a.cls == float_class_inf || b.cls == float_class_inf) {
1347         p_class = float_class_inf;
1348     } else if (a.cls == float_class_zero || b.cls == float_class_zero) {
1349         p_class = float_class_zero;
1350     } else {
1351         p_class = float_class_normal;
1352     }
1353 
1354     if (c.cls == float_class_inf) {
1355         if (p_class == float_class_inf && p_sign != c.sign) {
1356             s->float_exception_flags |= float_flag_invalid;
1357             return parts_default_nan(s);
1358         } else {
1359             a.cls = float_class_inf;
1360             a.sign = c.sign ^ sign_flip;
1361             return a;
1362         }
1363     }
1364 
1365     if (p_class == float_class_inf) {
1366         a.cls = float_class_inf;
1367         a.sign = p_sign ^ sign_flip;
1368         return a;
1369     }
1370 
1371     if (p_class == float_class_zero) {
1372         if (c.cls == float_class_zero) {
1373             if (p_sign != c.sign) {
1374                 p_sign = s->float_rounding_mode == float_round_down;
1375             }
1376             c.sign = p_sign;
1377         } else if (flags & float_muladd_halve_result) {
1378             c.exp -= 1;
1379         }
1380         c.sign ^= sign_flip;
1381         return c;
1382     }
1383 
1384     /* a & b should be normals now... */
1385     assert(a.cls == float_class_normal &&
1386            b.cls == float_class_normal);
1387 
1388     p_exp = a.exp + b.exp;
1389 
1390     /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
1391      * result.
1392      */
1393     mul64To128(a.frac, b.frac, &hi, &lo);
1394     /* binary point now at bit 124 */
1395 
1396     /* check for overflow */
1397     if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) {
1398         shift128RightJamming(hi, lo, 1, &hi, &lo);
1399         p_exp += 1;
1400     }
1401 
1402     /* + add/sub */
1403     if (c.cls == float_class_zero) {
1404         /* move binary point back to 62 */
1405         shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1406     } else {
1407         int exp_diff = p_exp - c.exp;
1408         if (p_sign == c.sign) {
1409             /* Addition */
1410             if (exp_diff <= 0) {
1411                 shift128RightJamming(hi, lo,
1412                                      DECOMPOSED_BINARY_POINT - exp_diff,
1413                                      &hi, &lo);
1414                 lo += c.frac;
1415                 p_exp = c.exp;
1416             } else {
1417                 uint64_t c_hi, c_lo;
1418                 /* shift c to the same binary point as the product (124) */
1419                 c_hi = c.frac >> 2;
1420                 c_lo = 0;
1421                 shift128RightJamming(c_hi, c_lo,
1422                                      exp_diff,
1423                                      &c_hi, &c_lo);
1424                 add128(hi, lo, c_hi, c_lo, &hi, &lo);
1425                 /* move binary point back to 62 */
1426                 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1427             }
1428 
1429             if (lo & DECOMPOSED_OVERFLOW_BIT) {
1430                 shift64RightJamming(lo, 1, &lo);
1431                 p_exp += 1;
1432             }
1433 
1434         } else {
1435             /* Subtraction */
1436             uint64_t c_hi, c_lo;
1437             /* make C binary point match product at bit 124 */
1438             c_hi = c.frac >> 2;
1439             c_lo = 0;
1440 
1441             if (exp_diff <= 0) {
1442                 shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
1443                 if (exp_diff == 0
1444                     &&
1445                     (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
1446                     sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1447                 } else {
1448                     sub128(c_hi, c_lo, hi, lo, &hi, &lo);
1449                     p_sign ^= 1;
1450                     p_exp = c.exp;
1451                 }
1452             } else {
1453                 shift128RightJamming(c_hi, c_lo,
1454                                      exp_diff,
1455                                      &c_hi, &c_lo);
1456                 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1457             }
1458 
1459             if (hi == 0 && lo == 0) {
1460                 a.cls = float_class_zero;
1461                 a.sign = s->float_rounding_mode == float_round_down;
1462                 a.sign ^= sign_flip;
1463                 return a;
1464             } else {
1465                 int shift;
1466                 if (hi != 0) {
1467                     shift = clz64(hi);
1468                 } else {
1469                     shift = clz64(lo) + 64;
1470                 }
1471                 /* Normalizing to a binary point of 124 is the
1472                    correct adjust for the exponent.  However since we're
1473                    shifting, we might as well put the binary point back
1474                    at 62 where we really want it.  Therefore shift as
1475                    if we're leaving 1 bit at the top of the word, but
1476                    adjust the exponent as if we're leaving 3 bits.  */
1477                 shift -= 1;
1478                 if (shift >= 64) {
1479                     lo = lo << (shift - 64);
1480                 } else {
1481                     hi = (hi << shift) | (lo >> (64 - shift));
1482                     lo = hi | ((lo << shift) != 0);
1483                 }
1484                 p_exp -= shift - 2;
1485             }
1486         }
1487     }
1488 
1489     if (flags & float_muladd_halve_result) {
1490         p_exp -= 1;
1491     }
1492 
1493     /* finally prepare our result */
1494     a.cls = float_class_normal;
1495     a.sign = p_sign ^ sign_flip;
1496     a.exp = p_exp;
1497     a.frac = lo;
1498 
1499     return a;
1500 }
1501 
1502 float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c,
1503                                                 int flags, float_status *status)
1504 {
1505     FloatParts pa = float16_unpack_canonical(a, status);
1506     FloatParts pb = float16_unpack_canonical(b, status);
1507     FloatParts pc = float16_unpack_canonical(c, status);
1508     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1509 
1510     return float16_round_pack_canonical(pr, status);
1511 }
1512 
1513 static float32 QEMU_SOFTFLOAT_ATTR
1514 soft_f32_muladd(float32 a, float32 b, float32 c, int flags,
1515                 float_status *status)
1516 {
1517     FloatParts pa = float32_unpack_canonical(a, status);
1518     FloatParts pb = float32_unpack_canonical(b, status);
1519     FloatParts pc = float32_unpack_canonical(c, status);
1520     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1521 
1522     return float32_round_pack_canonical(pr, status);
1523 }
1524 
1525 static float64 QEMU_SOFTFLOAT_ATTR
1526 soft_f64_muladd(float64 a, float64 b, float64 c, int flags,
1527                 float_status *status)
1528 {
1529     FloatParts pa = float64_unpack_canonical(a, status);
1530     FloatParts pb = float64_unpack_canonical(b, status);
1531     FloatParts pc = float64_unpack_canonical(c, status);
1532     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1533 
1534     return float64_round_pack_canonical(pr, status);
1535 }
1536 
1537 static bool force_soft_fma;
1538 
1539 float32 QEMU_FLATTEN
1540 float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s)
1541 {
1542     union_float32 ua, ub, uc, ur;
1543 
1544     ua.s = xa;
1545     ub.s = xb;
1546     uc.s = xc;
1547 
1548     if (unlikely(!can_use_fpu(s))) {
1549         goto soft;
1550     }
1551     if (unlikely(flags & float_muladd_halve_result)) {
1552         goto soft;
1553     }
1554 
1555     float32_input_flush3(&ua.s, &ub.s, &uc.s, s);
1556     if (unlikely(!f32_is_zon3(ua, ub, uc))) {
1557         goto soft;
1558     }
1559 
1560     if (unlikely(force_soft_fma)) {
1561         goto soft;
1562     }
1563 
1564     /*
1565      * When (a || b) == 0, there's no need to check for under/over flow,
1566      * since we know the addend is (normal || 0) and the product is 0.
1567      */
1568     if (float32_is_zero(ua.s) || float32_is_zero(ub.s)) {
1569         union_float32 up;
1570         bool prod_sign;
1571 
1572         prod_sign = float32_is_neg(ua.s) ^ float32_is_neg(ub.s);
1573         prod_sign ^= !!(flags & float_muladd_negate_product);
1574         up.s = float32_set_sign(float32_zero, prod_sign);
1575 
1576         if (flags & float_muladd_negate_c) {
1577             uc.h = -uc.h;
1578         }
1579         ur.h = up.h + uc.h;
1580     } else {
1581         union_float32 ua_orig = ua;
1582         union_float32 uc_orig = uc;
1583 
1584         if (flags & float_muladd_negate_product) {
1585             ua.h = -ua.h;
1586         }
1587         if (flags & float_muladd_negate_c) {
1588             uc.h = -uc.h;
1589         }
1590 
1591         ur.h = fmaf(ua.h, ub.h, uc.h);
1592 
1593         if (unlikely(f32_is_inf(ur))) {
1594             s->float_exception_flags |= float_flag_overflow;
1595         } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) {
1596             ua = ua_orig;
1597             uc = uc_orig;
1598             goto soft;
1599         }
1600     }
1601     if (flags & float_muladd_negate_result) {
1602         return float32_chs(ur.s);
1603     }
1604     return ur.s;
1605 
1606  soft:
1607     return soft_f32_muladd(ua.s, ub.s, uc.s, flags, s);
1608 }
1609 
1610 float64 QEMU_FLATTEN
1611 float64_muladd(float64 xa, float64 xb, float64 xc, int flags, float_status *s)
1612 {
1613     union_float64 ua, ub, uc, ur;
1614 
1615     ua.s = xa;
1616     ub.s = xb;
1617     uc.s = xc;
1618 
1619     if (unlikely(!can_use_fpu(s))) {
1620         goto soft;
1621     }
1622     if (unlikely(flags & float_muladd_halve_result)) {
1623         goto soft;
1624     }
1625 
1626     float64_input_flush3(&ua.s, &ub.s, &uc.s, s);
1627     if (unlikely(!f64_is_zon3(ua, ub, uc))) {
1628         goto soft;
1629     }
1630 
1631     if (unlikely(force_soft_fma)) {
1632         goto soft;
1633     }
1634 
1635     /*
1636      * When (a || b) == 0, there's no need to check for under/over flow,
1637      * since we know the addend is (normal || 0) and the product is 0.
1638      */
1639     if (float64_is_zero(ua.s) || float64_is_zero(ub.s)) {
1640         union_float64 up;
1641         bool prod_sign;
1642 
1643         prod_sign = float64_is_neg(ua.s) ^ float64_is_neg(ub.s);
1644         prod_sign ^= !!(flags & float_muladd_negate_product);
1645         up.s = float64_set_sign(float64_zero, prod_sign);
1646 
1647         if (flags & float_muladd_negate_c) {
1648             uc.h = -uc.h;
1649         }
1650         ur.h = up.h + uc.h;
1651     } else {
1652         union_float64 ua_orig = ua;
1653         union_float64 uc_orig = uc;
1654 
1655         if (flags & float_muladd_negate_product) {
1656             ua.h = -ua.h;
1657         }
1658         if (flags & float_muladd_negate_c) {
1659             uc.h = -uc.h;
1660         }
1661 
1662         ur.h = fma(ua.h, ub.h, uc.h);
1663 
1664         if (unlikely(f64_is_inf(ur))) {
1665             s->float_exception_flags |= float_flag_overflow;
1666         } else if (unlikely(fabs(ur.h) <= FLT_MIN)) {
1667             ua = ua_orig;
1668             uc = uc_orig;
1669             goto soft;
1670         }
1671     }
1672     if (flags & float_muladd_negate_result) {
1673         return float64_chs(ur.s);
1674     }
1675     return ur.s;
1676 
1677  soft:
1678     return soft_f64_muladd(ua.s, ub.s, uc.s, flags, s);
1679 }
1680 
1681 /*
1682  * Returns the result of dividing the floating-point value `a' by the
1683  * corresponding value `b'. The operation is performed according to
1684  * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1685  */
1686 
1687 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s)
1688 {
1689     bool sign = a.sign ^ b.sign;
1690 
1691     if (a.cls == float_class_normal && b.cls == float_class_normal) {
1692         uint64_t n0, n1, q, r;
1693         int exp = a.exp - b.exp;
1694 
1695         /*
1696          * We want a 2*N / N-bit division to produce exactly an N-bit
1697          * result, so that we do not lose any precision and so that we
1698          * do not have to renormalize afterward.  If A.frac < B.frac,
1699          * then division would produce an (N-1)-bit result; shift A left
1700          * by one to produce the an N-bit result, and decrement the
1701          * exponent to match.
1702          *
1703          * The udiv_qrnnd algorithm that we're using requires normalization,
1704          * i.e. the msb of the denominator must be set.  Since we know that
1705          * DECOMPOSED_BINARY_POINT is msb-1, the inputs must be shifted left
1706          * by one (more), and the remainder must be shifted right by one.
1707          */
1708         if (a.frac < b.frac) {
1709             exp -= 1;
1710             shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 2, &n1, &n0);
1711         } else {
1712             shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, &n1, &n0);
1713         }
1714         q = udiv_qrnnd(&r, n1, n0, b.frac << 1);
1715 
1716         /*
1717          * Set lsb if there is a remainder, to set inexact.
1718          * As mentioned above, to find the actual value of the remainder we
1719          * would need to shift right, but (1) we are only concerned about
1720          * non-zero-ness, and (2) the remainder will always be even because
1721          * both inputs to the division primitive are even.
1722          */
1723         a.frac = q | (r != 0);
1724         a.sign = sign;
1725         a.exp = exp;
1726         return a;
1727     }
1728     /* handle all the NaN cases */
1729     if (is_nan(a.cls) || is_nan(b.cls)) {
1730         return pick_nan(a, b, s);
1731     }
1732     /* 0/0 or Inf/Inf */
1733     if (a.cls == b.cls
1734         &&
1735         (a.cls == float_class_inf || a.cls == float_class_zero)) {
1736         s->float_exception_flags |= float_flag_invalid;
1737         return parts_default_nan(s);
1738     }
1739     /* Inf / x or 0 / x */
1740     if (a.cls == float_class_inf || a.cls == float_class_zero) {
1741         a.sign = sign;
1742         return a;
1743     }
1744     /* Div 0 => Inf */
1745     if (b.cls == float_class_zero) {
1746         s->float_exception_flags |= float_flag_divbyzero;
1747         a.cls = float_class_inf;
1748         a.sign = sign;
1749         return a;
1750     }
1751     /* Div by Inf */
1752     if (b.cls == float_class_inf) {
1753         a.cls = float_class_zero;
1754         a.sign = sign;
1755         return a;
1756     }
1757     g_assert_not_reached();
1758 }
1759 
1760 float16 float16_div(float16 a, float16 b, float_status *status)
1761 {
1762     FloatParts pa = float16_unpack_canonical(a, status);
1763     FloatParts pb = float16_unpack_canonical(b, status);
1764     FloatParts pr = div_floats(pa, pb, status);
1765 
1766     return float16_round_pack_canonical(pr, status);
1767 }
1768 
1769 static float32 QEMU_SOFTFLOAT_ATTR
1770 soft_f32_div(float32 a, float32 b, float_status *status)
1771 {
1772     FloatParts pa = float32_unpack_canonical(a, status);
1773     FloatParts pb = float32_unpack_canonical(b, status);
1774     FloatParts pr = div_floats(pa, pb, status);
1775 
1776     return float32_round_pack_canonical(pr, status);
1777 }
1778 
1779 static float64 QEMU_SOFTFLOAT_ATTR
1780 soft_f64_div(float64 a, float64 b, float_status *status)
1781 {
1782     FloatParts pa = float64_unpack_canonical(a, status);
1783     FloatParts pb = float64_unpack_canonical(b, status);
1784     FloatParts pr = div_floats(pa, pb, status);
1785 
1786     return float64_round_pack_canonical(pr, status);
1787 }
1788 
/* Native C single-precision division of @a by @b. */
static float hard_f32_div(float a, float b)
{
    const float quotient = a / b;

    return quotient;
}
1793 
/* Native C double-precision division of @a by @b. */
static double hard_f64_div(double a, double b)
{
    const double quotient = a / b;

    return quotient;
}
1798 
1799 static bool f32_div_pre(union_float32 a, union_float32 b)
1800 {
1801     if (QEMU_HARDFLOAT_2F32_USE_FP) {
1802         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
1803                fpclassify(b.h) == FP_NORMAL;
1804     }
1805     return float32_is_zero_or_normal(a.s) && float32_is_normal(b.s);
1806 }
1807 
1808 static bool f64_div_pre(union_float64 a, union_float64 b)
1809 {
1810     if (QEMU_HARDFLOAT_2F64_USE_FP) {
1811         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
1812                fpclassify(b.h) == FP_NORMAL;
1813     }
1814     return float64_is_zero_or_normal(a.s) && float64_is_normal(b.s);
1815 }
1816 
1817 static bool f32_div_post(union_float32 a, union_float32 b)
1818 {
1819     if (QEMU_HARDFLOAT_2F32_USE_FP) {
1820         return fpclassify(a.h) != FP_ZERO;
1821     }
1822     return !float32_is_zero(a.s);
1823 }
1824 
1825 static bool f64_div_post(union_float64 a, union_float64 b)
1826 {
1827     if (QEMU_HARDFLOAT_2F64_USE_FP) {
1828         return fpclassify(a.h) != FP_ZERO;
1829     }
1830     return !float64_is_zero(a.s);
1831 }
1832 
1833 float32 QEMU_FLATTEN
1834 float32_div(float32 a, float32 b, float_status *s)
1835 {
1836     return float32_gen2(a, b, s, hard_f32_div, soft_f32_div,
1837                         f32_div_pre, f32_div_post, NULL, NULL);
1838 }
1839 
1840 float64 QEMU_FLATTEN
1841 float64_div(float64 a, float64 b, float_status *s)
1842 {
1843     return float64_gen2(a, b, s, hard_f64_div, soft_f64_div,
1844                         f64_div_pre, f64_div_post, NULL, NULL);
1845 }
1846 
1847 /*
1848  * Float to Float conversions
1849  *
1850  * Returns the result of converting one float format to another. The
1851  * conversion is performed according to the IEC/IEEE Standard for
1852  * Binary Floating-Point Arithmetic.
1853  *
1854  * The float_to_float helper only needs to take care of raising
1855  * invalid exceptions and handling the conversion on NaNs.
1856  */
1857 
1858 static FloatParts float_to_float(FloatParts a, const FloatFmt *dstf,
1859                                  float_status *s)
1860 {
1861     if (dstf->arm_althp) {
1862         switch (a.cls) {
1863         case float_class_qnan:
1864         case float_class_snan:
1865             /* There is no NaN in the destination format.  Raise Invalid
1866              * and return a zero with the sign of the input NaN.
1867              */
1868             s->float_exception_flags |= float_flag_invalid;
1869             a.cls = float_class_zero;
1870             a.frac = 0;
1871             a.exp = 0;
1872             break;
1873 
1874         case float_class_inf:
1875             /* There is no Inf in the destination format.  Raise Invalid
1876              * and return the maximum normal with the correct sign.
1877              */
1878             s->float_exception_flags |= float_flag_invalid;
1879             a.cls = float_class_normal;
1880             a.exp = dstf->exp_max;
1881             a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift;
1882             break;
1883 
1884         default:
1885             break;
1886         }
1887     } else if (is_nan(a.cls)) {
1888         if (is_snan(a.cls)) {
1889             s->float_exception_flags |= float_flag_invalid;
1890             a = parts_silence_nan(a, s);
1891         }
1892         if (s->default_nan_mode) {
1893             return parts_default_nan(s);
1894         }
1895     }
1896     return a;
1897 }
1898 
1899 float32 float16_to_float32(float16 a, bool ieee, float_status *s)
1900 {
1901     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1902     FloatParts p = float16a_unpack_canonical(a, s, fmt16);
1903     FloatParts pr = float_to_float(p, &float32_params, s);
1904     return float32_round_pack_canonical(pr, s);
1905 }
1906 
1907 float64 float16_to_float64(float16 a, bool ieee, float_status *s)
1908 {
1909     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1910     FloatParts p = float16a_unpack_canonical(a, s, fmt16);
1911     FloatParts pr = float_to_float(p, &float64_params, s);
1912     return float64_round_pack_canonical(pr, s);
1913 }
1914 
1915 float16 float32_to_float16(float32 a, bool ieee, float_status *s)
1916 {
1917     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1918     FloatParts p = float32_unpack_canonical(a, s);
1919     FloatParts pr = float_to_float(p, fmt16, s);
1920     return float16a_round_pack_canonical(pr, s, fmt16);
1921 }
1922 
1923 static float64 QEMU_SOFTFLOAT_ATTR
1924 soft_float32_to_float64(float32 a, float_status *s)
1925 {
1926     FloatParts p = float32_unpack_canonical(a, s);
1927     FloatParts pr = float_to_float(p, &float64_params, s);
1928     return float64_round_pack_canonical(pr, s);
1929 }
1930 
1931 float64 float32_to_float64(float32 a, float_status *s)
1932 {
1933     if (likely(float32_is_normal(a))) {
1934         /* Widening conversion can never produce inexact results.  */
1935         union_float32 uf;
1936         union_float64 ud;
1937         uf.s = a;
1938         ud.h = uf.h;
1939         return ud.s;
1940     } else if (float32_is_zero(a)) {
1941         return float64_set_sign(float64_zero, float32_is_neg(a));
1942     } else {
1943         return soft_float32_to_float64(a, s);
1944     }
1945 }
1946 
1947 float16 float64_to_float16(float64 a, bool ieee, float_status *s)
1948 {
1949     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1950     FloatParts p = float64_unpack_canonical(a, s);
1951     FloatParts pr = float_to_float(p, fmt16, s);
1952     return float16a_round_pack_canonical(pr, s, fmt16);
1953 }
1954 
1955 float32 float64_to_float32(float64 a, float_status *s)
1956 {
1957     FloatParts p = float64_unpack_canonical(a, s);
1958     FloatParts pr = float_to_float(p, &float32_params, s);
1959     return float32_round_pack_canonical(pr, s);
1960 }
1961 
1962 /*
1963  * Rounds the floating-point value `a' to an integer, and returns the
1964  * result as a floating-point value. The operation is performed
1965  * according to the IEC/IEEE Standard for Binary Floating-Point
1966  * Arithmetic.
1967  */
1968 
/*
 * Round the decomposed value @a to an integral value.
 *
 * @rmode selects the rounding direction.  For normal inputs, @scale
 * (clamped to [-0x10000, 0x10000]) is added to the exponent before
 * rounding.  NaNs are handed to return_nan(); zeros and infinities are
 * returned unchanged.  float_flag_inexact is raised in @s whenever
 * non-zero fraction bits are discarded.  An unexpected class or rounding
 * mode reaches g_assert_not_reached().
 */
static FloatParts round_to_int(FloatParts a, int rmode,
                               int scale, float_status *s)
{
    switch (a.cls) {
    case float_class_qnan:
    case float_class_snan:
        return return_nan(a, s);

    case float_class_zero:
    case float_class_inf:
        /* already "integral" */
        break;

    case float_class_normal:
        /* Clamp the scale before applying it to the exponent. */
        scale = MIN(MAX(scale, -0x10000), 0x10000);
        a.exp += scale;

        if (a.exp >= DECOMPOSED_BINARY_POINT) {
            /* already integral */
            break;
        }
        if (a.exp < 0) {
            /* Set when the result is one in magnitude, clear for zero. */
            bool one;
            /* all fractional */
            s->float_exception_flags |= float_flag_inexact;
            switch (rmode) {
            case float_round_nearest_even:
                /* Only exp == -1 with bits below the implicit bit rounds up. */
                one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
                break;
            case float_round_ties_away:
                /* Any value with exp == -1 rounds up, including the tie. */
                one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
                break;
            case float_round_to_zero:
                one = false;
                break;
            case float_round_up:
                one = !a.sign;
                break;
            case float_round_down:
                one = a.sign;
                break;
            case float_round_to_odd:
                one = true;
                break;
            default:
                g_assert_not_reached();
            }

            if (one) {
                /* Result is one in magnitude; the sign field is untouched. */
                a.frac = DECOMPOSED_IMPLICIT_BIT;
                a.exp = 0;
            } else {
                a.cls = float_class_zero;
            }
        } else {
            /* Fraction bit that corresponds to the integer lsb. */
            uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
            /* Bit just below the integer lsb (one half). */
            uint64_t frac_lsbm1 = frac_lsb >> 1;
            /* Integer lsb plus every bit below it. */
            uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
            /* Every bit below the integer lsb (the discarded bits). */
            uint64_t rnd_mask = rnd_even_mask >> 1;
            /* Amount added to frac before the low bits are cleared. */
            uint64_t inc;

            switch (rmode) {
            case float_round_nearest_even:
                /*
                 * Add one half, except when the lsb is clear and the
                 * discarded bits are exactly one half (tie, already even).
                 */
                inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
                break;
            case float_round_ties_away:
                inc = frac_lsbm1;
                break;
            case float_round_to_zero:
                inc = 0;
                break;
            case float_round_up:
                inc = a.sign ? 0 : rnd_mask;
                break;
            case float_round_down:
                inc = a.sign ? rnd_mask : 0;
                break;
            case float_round_to_odd:
                /* Bump only when the integer lsb is currently clear. */
                inc = a.frac & frac_lsb ? 0 : rnd_mask;
                break;
            default:
                g_assert_not_reached();
            }

            /* Nothing to do (and no inexact) when no low bits are set. */
            if (a.frac & rnd_mask) {
                s->float_exception_flags |= float_flag_inexact;
                a.frac += inc;
                a.frac &= ~rnd_mask;
                /* The increment carried into the overflow bit: renormalize. */
                if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
                    a.frac >>= 1;
                    a.exp++;
                }
            }
        }
        break;
    default:
        g_assert_not_reached();
    }
    return a;
}
2069 
2070 float16 float16_round_to_int(float16 a, float_status *s)
2071 {
2072     FloatParts pa = float16_unpack_canonical(a, s);
2073     FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2074     return float16_round_pack_canonical(pr, s);
2075 }
2076 
2077 float32 float32_round_to_int(float32 a, float_status *s)
2078 {
2079     FloatParts pa = float32_unpack_canonical(a, s);
2080     FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2081     return float32_round_pack_canonical(pr, s);
2082 }
2083 
2084 float64 float64_round_to_int(float64 a, float_status *s)
2085 {
2086     FloatParts pa = float64_unpack_canonical(a, s);
2087     FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2088     return float64_round_pack_canonical(pr, s);
2089 }
2090 
2091 /*
2092  * Returns the result of converting the floating-point value `a' to
2093  * the two's complement integer format. The conversion is performed
2094  * according to the IEC/IEEE Standard for Binary Floating-Point
2095  * Arithmetic---which means in particular that the conversion is
2096  * rounded according to the current rounding mode. If `a' is a NaN,
2097  * the largest positive integer is returned. Otherwise, if the
2098  * conversion overflows, the largest integer with the same sign as `a'
2099  * is returned.
2100 */
2101 
2102 static int64_t round_to_int_and_pack(FloatParts in, int rmode, int scale,
2103                                      int64_t min, int64_t max,
2104                                      float_status *s)
2105 {
2106     uint64_t r;
2107     int orig_flags = get_float_exception_flags(s);
2108     FloatParts p = round_to_int(in, rmode, scale, s);
2109 
2110     switch (p.cls) {
2111     case float_class_snan:
2112     case float_class_qnan:
2113         s->float_exception_flags = orig_flags | float_flag_invalid;
2114         return max;
2115     case float_class_inf:
2116         s->float_exception_flags = orig_flags | float_flag_invalid;
2117         return p.sign ? min : max;
2118     case float_class_zero:
2119         return 0;
2120     case float_class_normal:
2121         if (p.exp < DECOMPOSED_BINARY_POINT) {
2122             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
2123         } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
2124             r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
2125         } else {
2126             r = UINT64_MAX;
2127         }
2128         if (p.sign) {
2129             if (r <= -(uint64_t) min) {
2130                 return -r;
2131             } else {
2132                 s->float_exception_flags = orig_flags | float_flag_invalid;
2133                 return min;
2134             }
2135         } else {
2136             if (r <= max) {
2137                 return r;
2138             } else {
2139                 s->float_exception_flags = orig_flags | float_flag_invalid;
2140                 return max;
2141             }
2142         }
2143     default:
2144         g_assert_not_reached();
2145     }
2146 }
2147 
2148 int16_t float16_to_int16_scalbn(float16 a, int rmode, int scale,
2149                                 float_status *s)
2150 {
2151     return round_to_int_and_pack(float16_unpack_canonical(a, s),
2152                                  rmode, scale, INT16_MIN, INT16_MAX, s);
2153 }
2154 
2155 int32_t float16_to_int32_scalbn(float16 a, int rmode, int scale,
2156                                 float_status *s)
2157 {
2158     return round_to_int_and_pack(float16_unpack_canonical(a, s),
2159                                  rmode, scale, INT32_MIN, INT32_MAX, s);
2160 }
2161 
2162 int64_t float16_to_int64_scalbn(float16 a, int rmode, int scale,
2163                                 float_status *s)
2164 {
2165     return round_to_int_and_pack(float16_unpack_canonical(a, s),
2166                                  rmode, scale, INT64_MIN, INT64_MAX, s);
2167 }
2168 
2169 int16_t float32_to_int16_scalbn(float32 a, int rmode, int scale,
2170                                 float_status *s)
2171 {
2172     return round_to_int_and_pack(float32_unpack_canonical(a, s),
2173                                  rmode, scale, INT16_MIN, INT16_MAX, s);
2174 }
2175 
2176 int32_t float32_to_int32_scalbn(float32 a, int rmode, int scale,
2177                                 float_status *s)
2178 {
2179     return round_to_int_and_pack(float32_unpack_canonical(a, s),
2180                                  rmode, scale, INT32_MIN, INT32_MAX, s);
2181 }
2182 
2183 int64_t float32_to_int64_scalbn(float32 a, int rmode, int scale,
2184                                 float_status *s)
2185 {
2186     return round_to_int_and_pack(float32_unpack_canonical(a, s),
2187                                  rmode, scale, INT64_MIN, INT64_MAX, s);
2188 }
2189 
2190 int16_t float64_to_int16_scalbn(float64 a, int rmode, int scale,
2191                                 float_status *s)
2192 {
2193     return round_to_int_and_pack(float64_unpack_canonical(a, s),
2194                                  rmode, scale, INT16_MIN, INT16_MAX, s);
2195 }
2196 
2197 int32_t float64_to_int32_scalbn(float64 a, int rmode, int scale,
2198                                 float_status *s)
2199 {
2200     return round_to_int_and_pack(float64_unpack_canonical(a, s),
2201                                  rmode, scale, INT32_MIN, INT32_MAX, s);
2202 }
2203 
2204 int64_t float64_to_int64_scalbn(float64 a, int rmode, int scale,
2205                                 float_status *s)
2206 {
2207     return round_to_int_and_pack(float64_unpack_canonical(a, s),
2208                                  rmode, scale, INT64_MIN, INT64_MAX, s);
2209 }
2210 
2211 int16_t float16_to_int16(float16 a, float_status *s)
2212 {
2213     return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2214 }
2215 
2216 int32_t float16_to_int32(float16 a, float_status *s)
2217 {
2218     return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2219 }
2220 
2221 int64_t float16_to_int64(float16 a, float_status *s)
2222 {
2223     return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2224 }
2225 
2226 int16_t float32_to_int16(float32 a, float_status *s)
2227 {
2228     return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2229 }
2230 
2231 int32_t float32_to_int32(float32 a, float_status *s)
2232 {
2233     return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2234 }
2235 
2236 int64_t float32_to_int64(float32 a, float_status *s)
2237 {
2238     return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2239 }
2240 
2241 int16_t float64_to_int16(float64 a, float_status *s)
2242 {
2243     return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2244 }
2245 
2246 int32_t float64_to_int32(float64 a, float_status *s)
2247 {
2248     return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2249 }
2250 
2251 int64_t float64_to_int64(float64 a, float_status *s)
2252 {
2253     return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2254 }
2255 
2256 int16_t float16_to_int16_round_to_zero(float16 a, float_status *s)
2257 {
2258     return float16_to_int16_scalbn(a, float_round_to_zero, 0, s);
2259 }
2260 
2261 int32_t float16_to_int32_round_to_zero(float16 a, float_status *s)
2262 {
2263     return float16_to_int32_scalbn(a, float_round_to_zero, 0, s);
2264 }
2265 
2266 int64_t float16_to_int64_round_to_zero(float16 a, float_status *s)
2267 {
2268     return float16_to_int64_scalbn(a, float_round_to_zero, 0, s);
2269 }
2270 
2271 int16_t float32_to_int16_round_to_zero(float32 a, float_status *s)
2272 {
2273     return float32_to_int16_scalbn(a, float_round_to_zero, 0, s);
2274 }
2275 
2276 int32_t float32_to_int32_round_to_zero(float32 a, float_status *s)
2277 {
2278     return float32_to_int32_scalbn(a, float_round_to_zero, 0, s);
2279 }
2280 
2281 int64_t float32_to_int64_round_to_zero(float32 a, float_status *s)
2282 {
2283     return float32_to_int64_scalbn(a, float_round_to_zero, 0, s);
2284 }
2285 
2286 int16_t float64_to_int16_round_to_zero(float64 a, float_status *s)
2287 {
2288     return float64_to_int16_scalbn(a, float_round_to_zero, 0, s);
2289 }
2290 
2291 int32_t float64_to_int32_round_to_zero(float64 a, float_status *s)
2292 {
2293     return float64_to_int32_scalbn(a, float_round_to_zero, 0, s);
2294 }
2295 
2296 int64_t float64_to_int64_round_to_zero(float64 a, float_status *s)
2297 {
2298     return float64_to_int64_scalbn(a, float_round_to_zero, 0, s);
2299 }
2300 
2301 /*
2302  *  Returns the result of converting the floating-point value `a' to
2303  *  the unsigned integer format. The conversion is performed according
2304  *  to the IEC/IEEE Standard for Binary Floating-Point
2305  *  Arithmetic---which means in particular that the conversion is
2306  *  rounded according to the current rounding mode. If `a' is a NaN,
2307  *  the largest unsigned integer is returned. Otherwise, if the
2308  *  conversion overflows, the largest unsigned integer is returned. If
 *  `a' is negative, the result is rounded and zero is returned;
 *  values that do not round to zero will raise the invalid exception
 *  flag.
2312  */
2313 
2314 static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, int scale,
2315                                        uint64_t max, float_status *s)
2316 {
2317     int orig_flags = get_float_exception_flags(s);
2318     FloatParts p = round_to_int(in, rmode, scale, s);
2319     uint64_t r;
2320 
2321     switch (p.cls) {
2322     case float_class_snan:
2323     case float_class_qnan:
2324         s->float_exception_flags = orig_flags | float_flag_invalid;
2325         return max;
2326     case float_class_inf:
2327         s->float_exception_flags = orig_flags | float_flag_invalid;
2328         return p.sign ? 0 : max;
2329     case float_class_zero:
2330         return 0;
2331     case float_class_normal:
2332         if (p.sign) {
2333             s->float_exception_flags = orig_flags | float_flag_invalid;
2334             return 0;
2335         }
2336 
2337         if (p.exp < DECOMPOSED_BINARY_POINT) {
2338             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
2339         } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
2340             r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
2341         } else {
2342             s->float_exception_flags = orig_flags | float_flag_invalid;
2343             return max;
2344         }
2345 
2346         /* For uint64 this will never trip, but if p.exp is too large
2347          * to shift a decomposed fraction we shall have exited via the
2348          * 3rd leg above.
2349          */
2350         if (r > max) {
2351             s->float_exception_flags = orig_flags | float_flag_invalid;
2352             return max;
2353         }
2354         return r;
2355     default:
2356         g_assert_not_reached();
2357     }
2358 }
2359 
2360 uint16_t float16_to_uint16_scalbn(float16 a, int rmode, int scale,
2361                                   float_status *s)
2362 {
2363     return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2364                                   rmode, scale, UINT16_MAX, s);
2365 }
2366 
2367 uint32_t float16_to_uint32_scalbn(float16 a, int rmode, int scale,
2368                                   float_status *s)
2369 {
2370     return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2371                                   rmode, scale, UINT32_MAX, s);
2372 }
2373 
2374 uint64_t float16_to_uint64_scalbn(float16 a, int rmode, int scale,
2375                                   float_status *s)
2376 {
2377     return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2378                                   rmode, scale, UINT64_MAX, s);
2379 }
2380 
2381 uint16_t float32_to_uint16_scalbn(float32 a, int rmode, int scale,
2382                                   float_status *s)
2383 {
2384     return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2385                                   rmode, scale, UINT16_MAX, s);
2386 }
2387 
2388 uint32_t float32_to_uint32_scalbn(float32 a, int rmode, int scale,
2389                                   float_status *s)
2390 {
2391     return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2392                                   rmode, scale, UINT32_MAX, s);
2393 }
2394 
2395 uint64_t float32_to_uint64_scalbn(float32 a, int rmode, int scale,
2396                                   float_status *s)
2397 {
2398     return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2399                                   rmode, scale, UINT64_MAX, s);
2400 }
2401 
2402 uint16_t float64_to_uint16_scalbn(float64 a, int rmode, int scale,
2403                                   float_status *s)
2404 {
2405     return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2406                                   rmode, scale, UINT16_MAX, s);
2407 }
2408 
2409 uint32_t float64_to_uint32_scalbn(float64 a, int rmode, int scale,
2410                                   float_status *s)
2411 {
2412     return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2413                                   rmode, scale, UINT32_MAX, s);
2414 }
2415 
2416 uint64_t float64_to_uint64_scalbn(float64 a, int rmode, int scale,
2417                                   float_status *s)
2418 {
2419     return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2420                                   rmode, scale, UINT64_MAX, s);
2421 }
2422 
2423 uint16_t float16_to_uint16(float16 a, float_status *s)
2424 {
2425     return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2426 }
2427 
2428 uint32_t float16_to_uint32(float16 a, float_status *s)
2429 {
2430     return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2431 }
2432 
2433 uint64_t float16_to_uint64(float16 a, float_status *s)
2434 {
2435     return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2436 }
2437 
2438 uint16_t float32_to_uint16(float32 a, float_status *s)
2439 {
2440     return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2441 }
2442 
2443 uint32_t float32_to_uint32(float32 a, float_status *s)
2444 {
2445     return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2446 }
2447 
2448 uint64_t float32_to_uint64(float32 a, float_status *s)
2449 {
2450     return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2451 }
2452 
2453 uint16_t float64_to_uint16(float64 a, float_status *s)
2454 {
2455     return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2456 }
2457 
2458 uint32_t float64_to_uint32(float64 a, float_status *s)
2459 {
2460     return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2461 }
2462 
2463 uint64_t float64_to_uint64(float64 a, float_status *s)
2464 {
2465     return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2466 }
2467 
2468 uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s)
2469 {
2470     return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2471 }
2472 
2473 uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s)
2474 {
2475     return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2476 }
2477 
2478 uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s)
2479 {
2480     return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2481 }
2482 
2483 uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *s)
2484 {
2485     return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2486 }
2487 
2488 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s)
2489 {
2490     return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2491 }
2492 
2493 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s)
2494 {
2495     return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2496 }
2497 
2498 uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s)
2499 {
2500     return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2501 }
2502 
2503 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s)
2504 {
2505     return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2506 }
2507 
2508 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s)
2509 {
2510     return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2511 }
2512 
2513 /*
2514  * Integer to float conversions
2515  *
2516  * Returns the result of converting the two's complement integer `a'
2517  * to the floating-point format. The conversion is performed according
2518  * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2519  */
2520 
2521 static FloatParts int_to_float(int64_t a, int scale, float_status *status)
2522 {
2523     FloatParts r = { .sign = false };
2524 
2525     if (a == 0) {
2526         r.cls = float_class_zero;
2527     } else {
2528         uint64_t f = a;
2529         int shift;
2530 
2531         r.cls = float_class_normal;
2532         if (a < 0) {
2533             f = -f;
2534             r.sign = true;
2535         }
2536         shift = clz64(f) - 1;
2537         scale = MIN(MAX(scale, -0x10000), 0x10000);
2538 
2539         r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
2540         r.frac = (shift < 0 ? DECOMPOSED_IMPLICIT_BIT : f << shift);
2541     }
2542 
2543     return r;
2544 }
2545 
2546 float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status)
2547 {
2548     FloatParts pa = int_to_float(a, scale, status);
2549     return float16_round_pack_canonical(pa, status);
2550 }
2551 
2552 float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status)
2553 {
2554     return int64_to_float16_scalbn(a, scale, status);
2555 }
2556 
2557 float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status)
2558 {
2559     return int64_to_float16_scalbn(a, scale, status);
2560 }
2561 
2562 float16 int64_to_float16(int64_t a, float_status *status)
2563 {
2564     return int64_to_float16_scalbn(a, 0, status);
2565 }
2566 
2567 float16 int32_to_float16(int32_t a, float_status *status)
2568 {
2569     return int64_to_float16_scalbn(a, 0, status);
2570 }
2571 
2572 float16 int16_to_float16(int16_t a, float_status *status)
2573 {
2574     return int64_to_float16_scalbn(a, 0, status);
2575 }
2576 
2577 float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status)
2578 {
2579     FloatParts pa = int_to_float(a, scale, status);
2580     return float32_round_pack_canonical(pa, status);
2581 }
2582 
2583 float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status)
2584 {
2585     return int64_to_float32_scalbn(a, scale, status);
2586 }
2587 
2588 float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status)
2589 {
2590     return int64_to_float32_scalbn(a, scale, status);
2591 }
2592 
2593 float32 int64_to_float32(int64_t a, float_status *status)
2594 {
2595     return int64_to_float32_scalbn(a, 0, status);
2596 }
2597 
2598 float32 int32_to_float32(int32_t a, float_status *status)
2599 {
2600     return int64_to_float32_scalbn(a, 0, status);
2601 }
2602 
2603 float32 int16_to_float32(int16_t a, float_status *status)
2604 {
2605     return int64_to_float32_scalbn(a, 0, status);
2606 }
2607 
2608 float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status)
2609 {
2610     FloatParts pa = int_to_float(a, scale, status);
2611     return float64_round_pack_canonical(pa, status);
2612 }
2613 
2614 float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status)
2615 {
2616     return int64_to_float64_scalbn(a, scale, status);
2617 }
2618 
2619 float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status)
2620 {
2621     return int64_to_float64_scalbn(a, scale, status);
2622 }
2623 
2624 float64 int64_to_float64(int64_t a, float_status *status)
2625 {
2626     return int64_to_float64_scalbn(a, 0, status);
2627 }
2628 
2629 float64 int32_to_float64(int32_t a, float_status *status)
2630 {
2631     return int64_to_float64_scalbn(a, 0, status);
2632 }
2633 
2634 float64 int16_to_float64(int16_t a, float_status *status)
2635 {
2636     return int64_to_float64_scalbn(a, 0, status);
2637 }
2638 
2639 
2640 /*
2641  * Unsigned Integer to float conversions
2642  *
2643  * Returns the result of converting the unsigned integer `a' to the
2644  * floating-point format. The conversion is performed according to the
2645  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2646  */
2647 
2648 static FloatParts uint_to_float(uint64_t a, int scale, float_status *status)
2649 {
2650     FloatParts r = { .sign = false };
2651 
2652     if (a == 0) {
2653         r.cls = float_class_zero;
2654     } else {
2655         scale = MIN(MAX(scale, -0x10000), 0x10000);
2656         r.cls = float_class_normal;
2657         if ((int64_t)a < 0) {
2658             r.exp = DECOMPOSED_BINARY_POINT + 1 + scale;
2659             shift64RightJamming(a, 1, &a);
2660             r.frac = a;
2661         } else {
2662             int shift = clz64(a) - 1;
2663             r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
2664             r.frac = a << shift;
2665         }
2666     }
2667 
2668     return r;
2669 }
2670 
2671 float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status)
2672 {
2673     FloatParts pa = uint_to_float(a, scale, status);
2674     return float16_round_pack_canonical(pa, status);
2675 }
2676 
2677 float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status)
2678 {
2679     return uint64_to_float16_scalbn(a, scale, status);
2680 }
2681 
2682 float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status)
2683 {
2684     return uint64_to_float16_scalbn(a, scale, status);
2685 }
2686 
2687 float16 uint64_to_float16(uint64_t a, float_status *status)
2688 {
2689     return uint64_to_float16_scalbn(a, 0, status);
2690 }
2691 
2692 float16 uint32_to_float16(uint32_t a, float_status *status)
2693 {
2694     return uint64_to_float16_scalbn(a, 0, status);
2695 }
2696 
2697 float16 uint16_to_float16(uint16_t a, float_status *status)
2698 {
2699     return uint64_to_float16_scalbn(a, 0, status);
2700 }
2701 
2702 float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status)
2703 {
2704     FloatParts pa = uint_to_float(a, scale, status);
2705     return float32_round_pack_canonical(pa, status);
2706 }
2707 
2708 float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status)
2709 {
2710     return uint64_to_float32_scalbn(a, scale, status);
2711 }
2712 
2713 float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status)
2714 {
2715     return uint64_to_float32_scalbn(a, scale, status);
2716 }
2717 
2718 float32 uint64_to_float32(uint64_t a, float_status *status)
2719 {
2720     return uint64_to_float32_scalbn(a, 0, status);
2721 }
2722 
2723 float32 uint32_to_float32(uint32_t a, float_status *status)
2724 {
2725     return uint64_to_float32_scalbn(a, 0, status);
2726 }
2727 
2728 float32 uint16_to_float32(uint16_t a, float_status *status)
2729 {
2730     return uint64_to_float32_scalbn(a, 0, status);
2731 }
2732 
2733 float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status)
2734 {
2735     FloatParts pa = uint_to_float(a, scale, status);
2736     return float64_round_pack_canonical(pa, status);
2737 }
2738 
2739 float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status)
2740 {
2741     return uint64_to_float64_scalbn(a, scale, status);
2742 }
2743 
2744 float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status)
2745 {
2746     return uint64_to_float64_scalbn(a, scale, status);
2747 }
2748 
2749 float64 uint64_to_float64(uint64_t a, float_status *status)
2750 {
2751     return uint64_to_float64_scalbn(a, 0, status);
2752 }
2753 
2754 float64 uint32_to_float64(uint32_t a, float_status *status)
2755 {
2756     return uint64_to_float64_scalbn(a, 0, status);
2757 }
2758 
2759 float64 uint16_to_float64(uint16_t a, float_status *status)
2760 {
2761     return uint64_to_float64_scalbn(a, 0, status);
2762 }
2763 
2764 /* Float Min/Max */
2765 /* min() and max() functions. These can't be implemented as
2766  * 'compare and pick one input' because that would mishandle
2767  * NaNs and +0 vs -0.
2768  *
2769  * minnum() and maxnum() functions. These are similar to the min()
2770  * and max() functions but if one of the arguments is a QNaN and
2771  * the other is numerical then the numerical argument is returned.
2772  * SNaNs will get quietened before being returned.
2773  * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
2774  * and maxNum() operations. min() and max() are the typical min/max
2775  * semantics provided by many CPUs which predate that specification.
2776  *
 * minnummag() and maxnummag() functions correspond to minNumMag()
 * and maxNumMag() from the IEEE-754 2008.
2779  */
2780 static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin,
2781                                 bool ieee, bool ismag, float_status *s)
2782 {
2783     if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
2784         if (ieee) {
2785             /* Takes two floating-point values `a' and `b', one of
2786              * which is a NaN, and returns the appropriate NaN
2787              * result. If either `a' or `b' is a signaling NaN,
2788              * the invalid exception is raised.
2789              */
2790             if (is_snan(a.cls) || is_snan(b.cls)) {
2791                 return pick_nan(a, b, s);
2792             } else if (is_nan(a.cls) && !is_nan(b.cls)) {
2793                 return b;
2794             } else if (is_nan(b.cls) && !is_nan(a.cls)) {
2795                 return a;
2796             }
2797         }
2798         return pick_nan(a, b, s);
2799     } else {
2800         int a_exp, b_exp;
2801 
2802         switch (a.cls) {
2803         case float_class_normal:
2804             a_exp = a.exp;
2805             break;
2806         case float_class_inf:
2807             a_exp = INT_MAX;
2808             break;
2809         case float_class_zero:
2810             a_exp = INT_MIN;
2811             break;
2812         default:
2813             g_assert_not_reached();
2814             break;
2815         }
2816         switch (b.cls) {
2817         case float_class_normal:
2818             b_exp = b.exp;
2819             break;
2820         case float_class_inf:
2821             b_exp = INT_MAX;
2822             break;
2823         case float_class_zero:
2824             b_exp = INT_MIN;
2825             break;
2826         default:
2827             g_assert_not_reached();
2828             break;
2829         }
2830 
2831         if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
2832             bool a_less = a_exp < b_exp;
2833             if (a_exp == b_exp) {
2834                 a_less = a.frac < b.frac;
2835             }
2836             return a_less ^ ismin ? b : a;
2837         }
2838 
2839         if (a.sign == b.sign) {
2840             bool a_less = a_exp < b_exp;
2841             if (a_exp == b_exp) {
2842                 a_less = a.frac < b.frac;
2843             }
2844             return a.sign ^ a_less ^ ismin ? b : a;
2845         } else {
2846             return a.sign ^ ismin ? b : a;
2847         }
2848     }
2849 }
2850 
2851 #define MINMAX(sz, name, ismin, isiee, ismag)                           \
2852 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b,      \
2853                                      float_status *s)                   \
2854 {                                                                       \
2855     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
2856     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
2857     FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);      \
2858                                                                         \
2859     return float ## sz ## _round_pack_canonical(pr, s);                 \
2860 }
2861 
2862 MINMAX(16, min, true, false, false)
2863 MINMAX(16, minnum, true, true, false)
2864 MINMAX(16, minnummag, true, true, true)
2865 MINMAX(16, max, false, false, false)
2866 MINMAX(16, maxnum, false, true, false)
2867 MINMAX(16, maxnummag, false, true, true)
2868 
2869 MINMAX(32, min, true, false, false)
2870 MINMAX(32, minnum, true, true, false)
2871 MINMAX(32, minnummag, true, true, true)
2872 MINMAX(32, max, false, false, false)
2873 MINMAX(32, maxnum, false, true, false)
2874 MINMAX(32, maxnummag, false, true, true)
2875 
2876 MINMAX(64, min, true, false, false)
2877 MINMAX(64, minnum, true, true, false)
2878 MINMAX(64, minnummag, true, true, true)
2879 MINMAX(64, max, false, false, false)
2880 MINMAX(64, maxnum, false, true, false)
2881 MINMAX(64, maxnummag, false, true, true)
2882 
2883 #undef MINMAX
2884 
2885 /* Floating point compare */
2886 static int compare_floats(FloatParts a, FloatParts b, bool is_quiet,
2887                           float_status *s)
2888 {
2889     if (is_nan(a.cls) || is_nan(b.cls)) {
2890         if (!is_quiet ||
2891             a.cls == float_class_snan ||
2892             b.cls == float_class_snan) {
2893             s->float_exception_flags |= float_flag_invalid;
2894         }
2895         return float_relation_unordered;
2896     }
2897 
2898     if (a.cls == float_class_zero) {
2899         if (b.cls == float_class_zero) {
2900             return float_relation_equal;
2901         }
2902         return b.sign ? float_relation_greater : float_relation_less;
2903     } else if (b.cls == float_class_zero) {
2904         return a.sign ? float_relation_less : float_relation_greater;
2905     }
2906 
2907     /* The only really important thing about infinity is its sign. If
2908      * both are infinities the sign marks the smallest of the two.
2909      */
2910     if (a.cls == float_class_inf) {
2911         if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
2912             return float_relation_equal;
2913         }
2914         return a.sign ? float_relation_less : float_relation_greater;
2915     } else if (b.cls == float_class_inf) {
2916         return b.sign ? float_relation_greater : float_relation_less;
2917     }
2918 
2919     if (a.sign != b.sign) {
2920         return a.sign ? float_relation_less : float_relation_greater;
2921     }
2922 
2923     if (a.exp == b.exp) {
2924         if (a.frac == b.frac) {
2925             return float_relation_equal;
2926         }
2927         if (a.sign) {
2928             return a.frac > b.frac ?
2929                 float_relation_less : float_relation_greater;
2930         } else {
2931             return a.frac > b.frac ?
2932                 float_relation_greater : float_relation_less;
2933         }
2934     } else {
2935         if (a.sign) {
2936             return a.exp > b.exp ? float_relation_less : float_relation_greater;
2937         } else {
2938             return a.exp > b.exp ? float_relation_greater : float_relation_less;
2939         }
2940     }
2941 }
2942 
2943 #define COMPARE(name, attr, sz)                                         \
2944 static int attr                                                         \
2945 name(float ## sz a, float ## sz b, bool is_quiet, float_status *s)      \
2946 {                                                                       \
2947     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
2948     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
2949     return compare_floats(pa, pb, is_quiet, s);                         \
2950 }
2951 
2952 COMPARE(soft_f16_compare, QEMU_FLATTEN, 16)
2953 COMPARE(soft_f32_compare, QEMU_SOFTFLOAT_ATTR, 32)
2954 COMPARE(soft_f64_compare, QEMU_SOFTFLOAT_ATTR, 64)
2955 
2956 #undef COMPARE
2957 
2958 int float16_compare(float16 a, float16 b, float_status *s)
2959 {
2960     return soft_f16_compare(a, b, false, s);
2961 }
2962 
2963 int float16_compare_quiet(float16 a, float16 b, float_status *s)
2964 {
2965     return soft_f16_compare(a, b, true, s);
2966 }
2967 
2968 static int QEMU_FLATTEN
2969 f32_compare(float32 xa, float32 xb, bool is_quiet, float_status *s)
2970 {
2971     union_float32 ua, ub;
2972 
2973     ua.s = xa;
2974     ub.s = xb;
2975 
2976     if (QEMU_NO_HARDFLOAT) {
2977         goto soft;
2978     }
2979 
2980     float32_input_flush2(&ua.s, &ub.s, s);
2981     if (isgreaterequal(ua.h, ub.h)) {
2982         if (isgreater(ua.h, ub.h)) {
2983             return float_relation_greater;
2984         }
2985         return float_relation_equal;
2986     }
2987     if (likely(isless(ua.h, ub.h))) {
2988         return float_relation_less;
2989     }
2990     /* The only condition remaining is unordered.
2991      * Fall through to set flags.
2992      */
2993  soft:
2994     return soft_f32_compare(ua.s, ub.s, is_quiet, s);
2995 }
2996 
2997 int float32_compare(float32 a, float32 b, float_status *s)
2998 {
2999     return f32_compare(a, b, false, s);
3000 }
3001 
3002 int float32_compare_quiet(float32 a, float32 b, float_status *s)
3003 {
3004     return f32_compare(a, b, true, s);
3005 }
3006 
3007 static int QEMU_FLATTEN
3008 f64_compare(float64 xa, float64 xb, bool is_quiet, float_status *s)
3009 {
3010     union_float64 ua, ub;
3011 
3012     ua.s = xa;
3013     ub.s = xb;
3014 
3015     if (QEMU_NO_HARDFLOAT) {
3016         goto soft;
3017     }
3018 
3019     float64_input_flush2(&ua.s, &ub.s, s);
3020     if (isgreaterequal(ua.h, ub.h)) {
3021         if (isgreater(ua.h, ub.h)) {
3022             return float_relation_greater;
3023         }
3024         return float_relation_equal;
3025     }
3026     if (likely(isless(ua.h, ub.h))) {
3027         return float_relation_less;
3028     }
3029     /* The only condition remaining is unordered.
3030      * Fall through to set flags.
3031      */
3032  soft:
3033     return soft_f64_compare(ua.s, ub.s, is_quiet, s);
3034 }
3035 
3036 int float64_compare(float64 a, float64 b, float_status *s)
3037 {
3038     return f64_compare(a, b, false, s);
3039 }
3040 
3041 int float64_compare_quiet(float64 a, float64 b, float_status *s)
3042 {
3043     return f64_compare(a, b, true, s);
3044 }
3045 
3046 /* Multiply A by 2 raised to the power N.  */
3047 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s)
3048 {
3049     if (unlikely(is_nan(a.cls))) {
3050         return return_nan(a, s);
3051     }
3052     if (a.cls == float_class_normal) {
3053         /* The largest float type (even though not supported by FloatParts)
3054          * is float128, which has a 15 bit exponent.  Bounding N to 16 bits
3055          * still allows rounding to infinity, without allowing overflow
3056          * within the int32_t that backs FloatParts.exp.
3057          */
3058         n = MIN(MAX(n, -0x10000), 0x10000);
3059         a.exp += n;
3060     }
3061     return a;
3062 }
3063 
3064 float16 float16_scalbn(float16 a, int n, float_status *status)
3065 {
3066     FloatParts pa = float16_unpack_canonical(a, status);
3067     FloatParts pr = scalbn_decomposed(pa, n, status);
3068     return float16_round_pack_canonical(pr, status);
3069 }
3070 
3071 float32 float32_scalbn(float32 a, int n, float_status *status)
3072 {
3073     FloatParts pa = float32_unpack_canonical(a, status);
3074     FloatParts pr = scalbn_decomposed(pa, n, status);
3075     return float32_round_pack_canonical(pr, status);
3076 }
3077 
3078 float64 float64_scalbn(float64 a, int n, float_status *status)
3079 {
3080     FloatParts pa = float64_unpack_canonical(a, status);
3081     FloatParts pr = scalbn_decomposed(pa, n, status);
3082     return float64_round_pack_canonical(pr, status);
3083 }
3084 
3085 /*
3086  * Square Root
3087  *
 * The old softfloat code did an approximation step before zeroing in
 * on the final result. However for simplicity we just compute the
3090  * square root by iterating down from the implicit bit to enough extra
3091  * bits to ensure we get a correctly rounded result.
3092  *
3093  * This does mean however the calculation is slower than before,
3094  * especially for 64 bit floats.
3095  */
3096 
3097 static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p)
3098 {
3099     uint64_t a_frac, r_frac, s_frac;
3100     int bit, last_bit;
3101 
3102     if (is_nan(a.cls)) {
3103         return return_nan(a, s);
3104     }
3105     if (a.cls == float_class_zero) {
3106         return a;  /* sqrt(+-0) = +-0 */
3107     }
3108     if (a.sign) {
3109         s->float_exception_flags |= float_flag_invalid;
3110         return parts_default_nan(s);
3111     }
3112     if (a.cls == float_class_inf) {
3113         return a;  /* sqrt(+inf) = +inf */
3114     }
3115 
3116     assert(a.cls == float_class_normal);
3117 
3118     /* We need two overflow bits at the top. Adding room for that is a
3119      * right shift. If the exponent is odd, we can discard the low bit
3120      * by multiplying the fraction by 2; that's a left shift. Combine
3121      * those and we shift right if the exponent is even.
3122      */
3123     a_frac = a.frac;
3124     if (!(a.exp & 1)) {
3125         a_frac >>= 1;
3126     }
3127     a.exp >>= 1;
3128 
3129     /* Bit-by-bit computation of sqrt.  */
3130     r_frac = 0;
3131     s_frac = 0;
3132 
3133     /* Iterate from implicit bit down to the 3 extra bits to compute a
3134      * properly rounded result. Remember we've inserted one more bit
3135      * at the top, so these positions are one less.
3136      */
3137     bit = DECOMPOSED_BINARY_POINT - 1;
3138     last_bit = MAX(p->frac_shift - 4, 0);
3139     do {
3140         uint64_t q = 1ULL << bit;
3141         uint64_t t_frac = s_frac + q;
3142         if (t_frac <= a_frac) {
3143             s_frac = t_frac + q;
3144             a_frac -= t_frac;
3145             r_frac += q;
3146         }
3147         a_frac <<= 1;
3148     } while (--bit >= last_bit);
3149 
3150     /* Undo the right shift done above. If there is any remaining
3151      * fraction, the result is inexact. Set the sticky bit.
3152      */
3153     a.frac = (r_frac << 1) + (a_frac != 0);
3154 
3155     return a;
3156 }
3157 
3158 float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status)
3159 {
3160     FloatParts pa = float16_unpack_canonical(a, status);
3161     FloatParts pr = sqrt_float(pa, status, &float16_params);
3162     return float16_round_pack_canonical(pr, status);
3163 }
3164 
3165 static float32 QEMU_SOFTFLOAT_ATTR
3166 soft_f32_sqrt(float32 a, float_status *status)
3167 {
3168     FloatParts pa = float32_unpack_canonical(a, status);
3169     FloatParts pr = sqrt_float(pa, status, &float32_params);
3170     return float32_round_pack_canonical(pr, status);
3171 }
3172 
3173 static float64 QEMU_SOFTFLOAT_ATTR
3174 soft_f64_sqrt(float64 a, float_status *status)
3175 {
3176     FloatParts pa = float64_unpack_canonical(a, status);
3177     FloatParts pr = sqrt_float(pa, status, &float64_params);
3178     return float64_round_pack_canonical(pr, status);
3179 }
3180 
3181 float32 QEMU_FLATTEN float32_sqrt(float32 xa, float_status *s)
3182 {
3183     union_float32 ua, ur;
3184 
3185     ua.s = xa;
3186     if (unlikely(!can_use_fpu(s))) {
3187         goto soft;
3188     }
3189 
3190     float32_input_flush1(&ua.s, s);
3191     if (QEMU_HARDFLOAT_1F32_USE_FP) {
3192         if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
3193                        fpclassify(ua.h) == FP_ZERO) ||
3194                      signbit(ua.h))) {
3195             goto soft;
3196         }
3197     } else if (unlikely(!float32_is_zero_or_normal(ua.s) ||
3198                         float32_is_neg(ua.s))) {
3199         goto soft;
3200     }
3201     ur.h = sqrtf(ua.h);
3202     return ur.s;
3203 
3204  soft:
3205     return soft_f32_sqrt(ua.s, s);
3206 }
3207 
3208 float64 QEMU_FLATTEN float64_sqrt(float64 xa, float_status *s)
3209 {
3210     union_float64 ua, ur;
3211 
3212     ua.s = xa;
3213     if (unlikely(!can_use_fpu(s))) {
3214         goto soft;
3215     }
3216 
3217     float64_input_flush1(&ua.s, s);
3218     if (QEMU_HARDFLOAT_1F64_USE_FP) {
3219         if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
3220                        fpclassify(ua.h) == FP_ZERO) ||
3221                      signbit(ua.h))) {
3222             goto soft;
3223         }
3224     } else if (unlikely(!float64_is_zero_or_normal(ua.s) ||
3225                         float64_is_neg(ua.s))) {
3226         goto soft;
3227     }
3228     ur.h = sqrt(ua.h);
3229     return ur.s;
3230 
3231  soft:
3232     return soft_f64_sqrt(ua.s, s);
3233 }
3234 
3235 /*----------------------------------------------------------------------------
3236 | The pattern for a default generated NaN.
3237 *----------------------------------------------------------------------------*/
3238 
3239 float16 float16_default_nan(float_status *status)
3240 {
3241     FloatParts p = parts_default_nan(status);
3242     p.frac >>= float16_params.frac_shift;
3243     return float16_pack_raw(p);
3244 }
3245 
3246 float32 float32_default_nan(float_status *status)
3247 {
3248     FloatParts p = parts_default_nan(status);
3249     p.frac >>= float32_params.frac_shift;
3250     return float32_pack_raw(p);
3251 }
3252 
3253 float64 float64_default_nan(float_status *status)
3254 {
3255     FloatParts p = parts_default_nan(status);
3256     p.frac >>= float64_params.frac_shift;
3257     return float64_pack_raw(p);
3258 }
3259 
3260 float128 float128_default_nan(float_status *status)
3261 {
3262     FloatParts p = parts_default_nan(status);
3263     float128 r;
3264 
3265     /* Extrapolate from the choices made by parts_default_nan to fill
3266      * in the quad-floating format.  If the low bit is set, assume we
3267      * want to set all non-snan bits.
3268      */
3269     r.low = -(p.frac & 1);
3270     r.high = p.frac >> (DECOMPOSED_BINARY_POINT - 48);
3271     r.high |= UINT64_C(0x7FFF000000000000);
3272     r.high |= (uint64_t)p.sign << 63;
3273 
3274     return r;
3275 }
3276 
3277 /*----------------------------------------------------------------------------
3278 | Returns a quiet NaN from a signalling NaN for the floating point value `a'.
3279 *----------------------------------------------------------------------------*/
3280 
3281 float16 float16_silence_nan(float16 a, float_status *status)
3282 {
3283     FloatParts p = float16_unpack_raw(a);
3284     p.frac <<= float16_params.frac_shift;
3285     p = parts_silence_nan(p, status);
3286     p.frac >>= float16_params.frac_shift;
3287     return float16_pack_raw(p);
3288 }
3289 
3290 float32 float32_silence_nan(float32 a, float_status *status)
3291 {
3292     FloatParts p = float32_unpack_raw(a);
3293     p.frac <<= float32_params.frac_shift;
3294     p = parts_silence_nan(p, status);
3295     p.frac >>= float32_params.frac_shift;
3296     return float32_pack_raw(p);
3297 }
3298 
3299 float64 float64_silence_nan(float64 a, float_status *status)
3300 {
3301     FloatParts p = float64_unpack_raw(a);
3302     p.frac <<= float64_params.frac_shift;
3303     p = parts_silence_nan(p, status);
3304     p.frac >>= float64_params.frac_shift;
3305     return float64_pack_raw(p);
3306 }
3307 
3308 
3309 /*----------------------------------------------------------------------------
3310 | If `a' is denormal and we are in flush-to-zero mode then set the
3311 | input-denormal exception and return zero. Otherwise just return the value.
3312 *----------------------------------------------------------------------------*/
3313 
3314 static bool parts_squash_denormal(FloatParts p, float_status *status)
3315 {
3316     if (p.exp == 0 && p.frac != 0) {
3317         float_raise(float_flag_input_denormal, status);
3318         return true;
3319     }
3320 
3321     return false;
3322 }
3323 
3324 float16 float16_squash_input_denormal(float16 a, float_status *status)
3325 {
3326     if (status->flush_inputs_to_zero) {
3327         FloatParts p = float16_unpack_raw(a);
3328         if (parts_squash_denormal(p, status)) {
3329             return float16_set_sign(float16_zero, p.sign);
3330         }
3331     }
3332     return a;
3333 }
3334 
3335 float32 float32_squash_input_denormal(float32 a, float_status *status)
3336 {
3337     if (status->flush_inputs_to_zero) {
3338         FloatParts p = float32_unpack_raw(a);
3339         if (parts_squash_denormal(p, status)) {
3340             return float32_set_sign(float32_zero, p.sign);
3341         }
3342     }
3343     return a;
3344 }
3345 
3346 float64 float64_squash_input_denormal(float64 a, float_status *status)
3347 {
3348     if (status->flush_inputs_to_zero) {
3349         FloatParts p = float64_unpack_raw(a);
3350         if (parts_squash_denormal(p, status)) {
3351             return float64_set_sign(float64_zero, p.sign);
3352         }
3353     }
3354     return a;
3355 }
3356 
3357 /*----------------------------------------------------------------------------
3358 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
3359 | and 7, and returns the properly rounded 32-bit integer corresponding to the
3360 | input.  If `zSign' is 1, the input is negated before being converted to an
3361 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
3362 | is simply rounded to an integer, with the inexact exception raised if the
3363 | input cannot be represented exactly as an integer.  However, if the fixed-
3364 | point input is too large, the invalid exception is raised and the largest
3365 | positive or negative integer is returned.
3366 *----------------------------------------------------------------------------*/
3367 
3368 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
3369 {
3370     int8_t roundingMode;
3371     flag roundNearestEven;
3372     int8_t roundIncrement, roundBits;
3373     int32_t z;
3374 
3375     roundingMode = status->float_rounding_mode;
3376     roundNearestEven = ( roundingMode == float_round_nearest_even );
3377     switch (roundingMode) {
3378     case float_round_nearest_even:
3379     case float_round_ties_away:
3380         roundIncrement = 0x40;
3381         break;
3382     case float_round_to_zero:
3383         roundIncrement = 0;
3384         break;
3385     case float_round_up:
3386         roundIncrement = zSign ? 0 : 0x7f;
3387         break;
3388     case float_round_down:
3389         roundIncrement = zSign ? 0x7f : 0;
3390         break;
3391     case float_round_to_odd:
3392         roundIncrement = absZ & 0x80 ? 0 : 0x7f;
3393         break;
3394     default:
3395         abort();
3396     }
3397     roundBits = absZ & 0x7F;
3398     absZ = ( absZ + roundIncrement )>>7;
3399     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
3400     z = absZ;
3401     if ( zSign ) z = - z;
3402     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
3403         float_raise(float_flag_invalid, status);
3404         return zSign ? INT32_MIN : INT32_MAX;
3405     }
3406     if (roundBits) {
3407         status->float_exception_flags |= float_flag_inexact;
3408     }
3409     return z;
3410 
3411 }
3412 
3413 /*----------------------------------------------------------------------------
3414 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3415 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3416 | and returns the properly rounded 64-bit integer corresponding to the input.
3417 | If `zSign' is 1, the input is negated before being converted to an integer.
3418 | Ordinarily, the fixed-point input is simply rounded to an integer, with
3419 | the inexact exception raised if the input cannot be represented exactly as
3420 | an integer.  However, if the fixed-point input is too large, the invalid
3421 | exception is raised and the largest positive or negative integer is
3422 | returned.
3423 *----------------------------------------------------------------------------*/
3424 
3425 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
3426                                float_status *status)
3427 {
3428     int8_t roundingMode;
3429     flag roundNearestEven, increment;
3430     int64_t z;
3431 
3432     roundingMode = status->float_rounding_mode;
3433     roundNearestEven = ( roundingMode == float_round_nearest_even );
3434     switch (roundingMode) {
3435     case float_round_nearest_even:
3436     case float_round_ties_away:
3437         increment = ((int64_t) absZ1 < 0);
3438         break;
3439     case float_round_to_zero:
3440         increment = 0;
3441         break;
3442     case float_round_up:
3443         increment = !zSign && absZ1;
3444         break;
3445     case float_round_down:
3446         increment = zSign && absZ1;
3447         break;
3448     case float_round_to_odd:
3449         increment = !(absZ0 & 1) && absZ1;
3450         break;
3451     default:
3452         abort();
3453     }
3454     if ( increment ) {
3455         ++absZ0;
3456         if ( absZ0 == 0 ) goto overflow;
3457         absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
3458     }
3459     z = absZ0;
3460     if ( zSign ) z = - z;
3461     if ( z && ( ( z < 0 ) ^ zSign ) ) {
3462  overflow:
3463         float_raise(float_flag_invalid, status);
3464         return zSign ? INT64_MIN : INT64_MAX;
3465     }
3466     if (absZ1) {
3467         status->float_exception_flags |= float_flag_inexact;
3468     }
3469     return z;
3470 
3471 }
3472 
3473 /*----------------------------------------------------------------------------
3474 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3475 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3476 | and returns the properly rounded 64-bit unsigned integer corresponding to the
3477 | input.  Ordinarily, the fixed-point input is simply rounded to an integer,
3478 | with the inexact exception raised if the input cannot be represented exactly
3479 | as an integer.  However, if the fixed-point input is too large, the invalid
3480 | exception is raised and the largest unsigned integer is returned.
3481 *----------------------------------------------------------------------------*/
3482 
3483 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
3484                                 uint64_t absZ1, float_status *status)
3485 {
3486     int8_t roundingMode;
3487     flag roundNearestEven, increment;
3488 
3489     roundingMode = status->float_rounding_mode;
3490     roundNearestEven = (roundingMode == float_round_nearest_even);
3491     switch (roundingMode) {
3492     case float_round_nearest_even:
3493     case float_round_ties_away:
3494         increment = ((int64_t)absZ1 < 0);
3495         break;
3496     case float_round_to_zero:
3497         increment = 0;
3498         break;
3499     case float_round_up:
3500         increment = !zSign && absZ1;
3501         break;
3502     case float_round_down:
3503         increment = zSign && absZ1;
3504         break;
3505     case float_round_to_odd:
3506         increment = !(absZ0 & 1) && absZ1;
3507         break;
3508     default:
3509         abort();
3510     }
3511     if (increment) {
3512         ++absZ0;
3513         if (absZ0 == 0) {
3514             float_raise(float_flag_invalid, status);
3515             return UINT64_MAX;
3516         }
3517         absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
3518     }
3519 
3520     if (zSign && absZ0) {
3521         float_raise(float_flag_invalid, status);
3522         return 0;
3523     }
3524 
3525     if (absZ1) {
3526         status->float_exception_flags |= float_flag_inexact;
3527     }
3528     return absZ0;
3529 }
3530 
3531 /*----------------------------------------------------------------------------
3532 | Normalizes the subnormal single-precision floating-point value represented
3533 | by the denormalized significand `aSig'.  The normalized exponent and
3534 | significand are stored at the locations pointed to by `zExpPtr' and
3535 | `zSigPtr', respectively.
3536 *----------------------------------------------------------------------------*/
3537 
static void
 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
{
    /* Left shift applied to the significand: clz32(aSig) less 8. */
    const int8_t shift = clz32(aSig) - 8;

    *zSigPtr = aSig << shift;
    *zExpPtr = 1 - shift;
}
3548 
3549 /*----------------------------------------------------------------------------
3550 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3551 | and significand `zSig', and returns the proper single-precision floating-
3552 | point value corresponding to the abstract input.  Ordinarily, the abstract
3553 | value is simply rounded and packed into the single-precision format, with
3554 | the inexact exception raised if the abstract input cannot be represented
3555 | exactly.  However, if the abstract value is too large, the overflow and
3556 | inexact exceptions are raised and an infinity or maximal finite value is
3557 | returned.  If the abstract value is too small, the input value is rounded to
3558 | a subnormal number, and the underflow and inexact exceptions are raised if
3559 | the abstract input cannot be represented exactly as a subnormal single-
3560 | precision floating-point number.
3561 |     The input significand `zSig' has its binary point between bits 30
3562 | and 29, which is 7 bits to the left of the usual location.  This shifted
3563 | significand must be normalized or smaller.  If `zSig' is not normalized,
3564 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3565 | and it must not require rounding.  In the usual case that `zSig' is
3566 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3567 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3568 | Binary Floating-Point Arithmetic.
3569 *----------------------------------------------------------------------------*/
3570 
3571 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
3572                                    float_status *status)
3573 {
3574     int8_t roundingMode;
3575     flag roundNearestEven;
3576     int8_t roundIncrement, roundBits;
3577     flag isTiny;
3578 
3579     roundingMode = status->float_rounding_mode;
3580     roundNearestEven = ( roundingMode == float_round_nearest_even );
3581     switch (roundingMode) {
3582     case float_round_nearest_even:
3583     case float_round_ties_away:
3584         roundIncrement = 0x40;
3585         break;
3586     case float_round_to_zero:
3587         roundIncrement = 0;
3588         break;
3589     case float_round_up:
3590         roundIncrement = zSign ? 0 : 0x7f;
3591         break;
3592     case float_round_down:
3593         roundIncrement = zSign ? 0x7f : 0;
3594         break;
3595     case float_round_to_odd:
3596         roundIncrement = zSig & 0x80 ? 0 : 0x7f;
3597         break;
3598     default:
3599         abort();
3600         break;
3601     }
3602     roundBits = zSig & 0x7F;
3603     if ( 0xFD <= (uint16_t) zExp ) {
3604         if (    ( 0xFD < zExp )
3605              || (    ( zExp == 0xFD )
3606                   && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
3607            ) {
3608             bool overflow_to_inf = roundingMode != float_round_to_odd &&
3609                                    roundIncrement != 0;
3610             float_raise(float_flag_overflow | float_flag_inexact, status);
3611             return packFloat32(zSign, 0xFF, -!overflow_to_inf);
3612         }
3613         if ( zExp < 0 ) {
3614             if (status->flush_to_zero) {
3615                 float_raise(float_flag_output_denormal, status);
3616                 return packFloat32(zSign, 0, 0);
3617             }
3618             isTiny =
3619                 (status->float_detect_tininess
3620                  == float_tininess_before_rounding)
3621                 || ( zExp < -1 )
3622                 || ( zSig + roundIncrement < 0x80000000 );
3623             shift32RightJamming( zSig, - zExp, &zSig );
3624             zExp = 0;
3625             roundBits = zSig & 0x7F;
3626             if (isTiny && roundBits) {
3627                 float_raise(float_flag_underflow, status);
3628             }
3629             if (roundingMode == float_round_to_odd) {
3630                 /*
3631                  * For round-to-odd case, the roundIncrement depends on
3632                  * zSig which just changed.
3633                  */
3634                 roundIncrement = zSig & 0x80 ? 0 : 0x7f;
3635             }
3636         }
3637     }
3638     if (roundBits) {
3639         status->float_exception_flags |= float_flag_inexact;
3640     }
3641     zSig = ( zSig + roundIncrement )>>7;
3642     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
3643     if ( zSig == 0 ) zExp = 0;
3644     return packFloat32( zSign, zExp, zSig );
3645 
3646 }
3647 
3648 /*----------------------------------------------------------------------------
3649 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3650 | and significand `zSig', and returns the proper single-precision floating-
3651 | point value corresponding to the abstract input.  This routine is just like
3652 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
3653 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
3654 | floating-point exponent.
3655 *----------------------------------------------------------------------------*/
3656 
3657 static float32
3658  normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
3659                               float_status *status)
3660 {
3661     int8_t shiftCount;
3662 
3663     shiftCount = clz32(zSig) - 1;
3664     return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
3665                                status);
3666 
3667 }
3668 
/*----------------------------------------------------------------------------
| Normalizes a subnormal double-precision value given its denormalized
| significand `aSig'.  The significand is shifted left by `clz64(aSig) - 11'
| bit positions; the shifted significand is written to `*zSigPtr' and the
| matching exponent, `1 - shift', is written to `*zExpPtr'.
*----------------------------------------------------------------------------*/

static void
 normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
{
    const int8_t shift = clz64(aSig) - 11;

    *zSigPtr = aSig << shift;
    *zExpPtr = 1 - shift;
}
3686 
3687 /*----------------------------------------------------------------------------
3688 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3689 | double-precision floating-point value, returning the result.  After being
3690 | shifted into the proper positions, the three fields are simply added
3691 | together to form the result.  This means that any integer portion of `zSig'
3692 | will be added into the exponent.  Since a properly normalized significand
3693 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3694 | than the desired result exponent whenever `zSig' is a complete, normalized
3695 | significand.
3696 *----------------------------------------------------------------------------*/
3697 
3698 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig)
3699 {
3700 
3701     return make_float64(
3702         ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
3703 
3704 }
3705 
3706 /*----------------------------------------------------------------------------
3707 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3708 | and significand `zSig', and returns the proper double-precision floating-
3709 | point value corresponding to the abstract input.  Ordinarily, the abstract
3710 | value is simply rounded and packed into the double-precision format, with
3711 | the inexact exception raised if the abstract input cannot be represented
3712 | exactly.  However, if the abstract value is too large, the overflow and
3713 | inexact exceptions are raised and an infinity or maximal finite value is
3714 | returned.  If the abstract value is too small, the input value is rounded to
3715 | a subnormal number, and the underflow and inexact exceptions are raised if
3716 | the abstract input cannot be represented exactly as a subnormal double-
3717 | precision floating-point number.
3718 |     The input significand `zSig' has its binary point between bits 62
3719 | and 61, which is 10 bits to the left of the usual location.  This shifted
3720 | significand must be normalized or smaller.  If `zSig' is not normalized,
3721 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3722 | and it must not require rounding.  In the usual case that `zSig' is
3723 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3724 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3725 | Binary Floating-Point Arithmetic.
3726 *----------------------------------------------------------------------------*/
3727 
3728 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
3729                                    float_status *status)
3730 {
3731     int8_t roundingMode;
3732     flag roundNearestEven;
3733     int roundIncrement, roundBits;
3734     flag isTiny;
3735 
3736     roundingMode = status->float_rounding_mode;
3737     roundNearestEven = ( roundingMode == float_round_nearest_even );
3738     switch (roundingMode) {
3739     case float_round_nearest_even:
3740     case float_round_ties_away:
3741         roundIncrement = 0x200;
3742         break;
3743     case float_round_to_zero:
3744         roundIncrement = 0;
3745         break;
3746     case float_round_up:
3747         roundIncrement = zSign ? 0 : 0x3ff;
3748         break;
3749     case float_round_down:
3750         roundIncrement = zSign ? 0x3ff : 0;
3751         break;
3752     case float_round_to_odd:
3753         roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
3754         break;
3755     default:
3756         abort();
3757     }
3758     roundBits = zSig & 0x3FF;
3759     if ( 0x7FD <= (uint16_t) zExp ) {
3760         if (    ( 0x7FD < zExp )
3761              || (    ( zExp == 0x7FD )
3762                   && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
3763            ) {
3764             bool overflow_to_inf = roundingMode != float_round_to_odd &&
3765                                    roundIncrement != 0;
3766             float_raise(float_flag_overflow | float_flag_inexact, status);
3767             return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
3768         }
3769         if ( zExp < 0 ) {
3770             if (status->flush_to_zero) {
3771                 float_raise(float_flag_output_denormal, status);
3772                 return packFloat64(zSign, 0, 0);
3773             }
3774             isTiny =
3775                    (status->float_detect_tininess
3776                     == float_tininess_before_rounding)
3777                 || ( zExp < -1 )
3778                 || ( zSig + roundIncrement < UINT64_C(0x8000000000000000) );
3779             shift64RightJamming( zSig, - zExp, &zSig );
3780             zExp = 0;
3781             roundBits = zSig & 0x3FF;
3782             if (isTiny && roundBits) {
3783                 float_raise(float_flag_underflow, status);
3784             }
3785             if (roundingMode == float_round_to_odd) {
3786                 /*
3787                  * For round-to-odd case, the roundIncrement depends on
3788                  * zSig which just changed.
3789                  */
3790                 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
3791             }
3792         }
3793     }
3794     if (roundBits) {
3795         status->float_exception_flags |= float_flag_inexact;
3796     }
3797     zSig = ( zSig + roundIncrement )>>10;
3798     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
3799     if ( zSig == 0 ) zExp = 0;
3800     return packFloat64( zSign, zExp, zSig );
3801 
3802 }
3803 
3804 /*----------------------------------------------------------------------------
3805 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3806 | and significand `zSig', and returns the proper double-precision floating-
3807 | point value corresponding to the abstract input.  This routine is just like
3808 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
3809 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
3810 | floating-point exponent.
3811 *----------------------------------------------------------------------------*/
3812 
3813 static float64
3814  normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
3815                               float_status *status)
3816 {
3817     int8_t shiftCount;
3818 
3819     shiftCount = clz64(zSig) - 1;
3820     return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
3821                                status);
3822 
3823 }
3824 
/*----------------------------------------------------------------------------
| Normalizes a subnormal extended double-precision value given its
| denormalized significand `aSig'.  The significand is shifted left by
| `clz64(aSig)' bit positions; the shifted significand is written to
| `*zSigPtr' and the matching exponent, `1 - shift', is written to `*zExpPtr'.
*----------------------------------------------------------------------------*/

void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
                                uint64_t *zSigPtr)
{
    const int8_t shift = clz64(aSig);

    *zSigPtr = aSig << shift;
    *zExpPtr = 1 - shift;
}
3841 
3842 /*----------------------------------------------------------------------------
3843 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3844 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
3845 | and returns the proper extended double-precision floating-point value
3846 | corresponding to the abstract input.  Ordinarily, the abstract value is
3847 | rounded and packed into the extended double-precision format, with the
3848 | inexact exception raised if the abstract input cannot be represented
3849 | exactly.  However, if the abstract value is too large, the overflow and
3850 | inexact exceptions are raised and an infinity or maximal finite value is
3851 | returned.  If the abstract value is too small, the input value is rounded to
3852 | a subnormal number, and the underflow and inexact exceptions are raised if
3853 | the abstract input cannot be represented exactly as a subnormal extended
3854 | double-precision floating-point number.
3855 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
3856 | number of bits as single or double precision, respectively.  Otherwise, the
3857 | result is rounded to the full precision of the extended double-precision
3858 | format.
3859 |     The input significand must be normalized or smaller.  If the input
3860 | significand is not normalized, `zExp' must be 0; in that case, the result
3861 | returned is a subnormal number, and it must not require rounding.  The
3862 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
3863 | Floating-Point Arithmetic.
3864 *----------------------------------------------------------------------------*/
3865 
3866 floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
3867                               int32_t zExp, uint64_t zSig0, uint64_t zSig1,
3868                               float_status *status)
3869 {
3870     int8_t roundingMode;
3871     flag roundNearestEven, increment, isTiny;
3872     int64_t roundIncrement, roundMask, roundBits;
3873 
3874     roundingMode = status->float_rounding_mode;
3875     roundNearestEven = ( roundingMode == float_round_nearest_even );
3876     if ( roundingPrecision == 80 ) goto precision80;
3877     if ( roundingPrecision == 64 ) {
3878         roundIncrement = UINT64_C(0x0000000000000400);
3879         roundMask = UINT64_C(0x00000000000007FF);
3880     }
3881     else if ( roundingPrecision == 32 ) {
3882         roundIncrement = UINT64_C(0x0000008000000000);
3883         roundMask = UINT64_C(0x000000FFFFFFFFFF);
3884     }
3885     else {
3886         goto precision80;
3887     }
3888     zSig0 |= ( zSig1 != 0 );
3889     switch (roundingMode) {
3890     case float_round_nearest_even:
3891     case float_round_ties_away:
3892         break;
3893     case float_round_to_zero:
3894         roundIncrement = 0;
3895         break;
3896     case float_round_up:
3897         roundIncrement = zSign ? 0 : roundMask;
3898         break;
3899     case float_round_down:
3900         roundIncrement = zSign ? roundMask : 0;
3901         break;
3902     default:
3903         abort();
3904     }
3905     roundBits = zSig0 & roundMask;
3906     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
3907         if (    ( 0x7FFE < zExp )
3908              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
3909            ) {
3910             goto overflow;
3911         }
3912         if ( zExp <= 0 ) {
3913             if (status->flush_to_zero) {
3914                 float_raise(float_flag_output_denormal, status);
3915                 return packFloatx80(zSign, 0, 0);
3916             }
3917             isTiny =
3918                    (status->float_detect_tininess
3919                     == float_tininess_before_rounding)
3920                 || ( zExp < 0 )
3921                 || ( zSig0 <= zSig0 + roundIncrement );
3922             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
3923             zExp = 0;
3924             roundBits = zSig0 & roundMask;
3925             if (isTiny && roundBits) {
3926                 float_raise(float_flag_underflow, status);
3927             }
3928             if (roundBits) {
3929                 status->float_exception_flags |= float_flag_inexact;
3930             }
3931             zSig0 += roundIncrement;
3932             if ( (int64_t) zSig0 < 0 ) zExp = 1;
3933             roundIncrement = roundMask + 1;
3934             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
3935                 roundMask |= roundIncrement;
3936             }
3937             zSig0 &= ~ roundMask;
3938             return packFloatx80( zSign, zExp, zSig0 );
3939         }
3940     }
3941     if (roundBits) {
3942         status->float_exception_flags |= float_flag_inexact;
3943     }
3944     zSig0 += roundIncrement;
3945     if ( zSig0 < roundIncrement ) {
3946         ++zExp;
3947         zSig0 = UINT64_C(0x8000000000000000);
3948     }
3949     roundIncrement = roundMask + 1;
3950     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
3951         roundMask |= roundIncrement;
3952     }
3953     zSig0 &= ~ roundMask;
3954     if ( zSig0 == 0 ) zExp = 0;
3955     return packFloatx80( zSign, zExp, zSig0 );
3956  precision80:
3957     switch (roundingMode) {
3958     case float_round_nearest_even:
3959     case float_round_ties_away:
3960         increment = ((int64_t)zSig1 < 0);
3961         break;
3962     case float_round_to_zero:
3963         increment = 0;
3964         break;
3965     case float_round_up:
3966         increment = !zSign && zSig1;
3967         break;
3968     case float_round_down:
3969         increment = zSign && zSig1;
3970         break;
3971     default:
3972         abort();
3973     }
3974     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
3975         if (    ( 0x7FFE < zExp )
3976              || (    ( zExp == 0x7FFE )
3977                   && ( zSig0 == UINT64_C(0xFFFFFFFFFFFFFFFF) )
3978                   && increment
3979                 )
3980            ) {
3981             roundMask = 0;
3982  overflow:
3983             float_raise(float_flag_overflow | float_flag_inexact, status);
3984             if (    ( roundingMode == float_round_to_zero )
3985                  || ( zSign && ( roundingMode == float_round_up ) )
3986                  || ( ! zSign && ( roundingMode == float_round_down ) )
3987                ) {
3988                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
3989             }
3990             return packFloatx80(zSign,
3991                                 floatx80_infinity_high,
3992                                 floatx80_infinity_low);
3993         }
3994         if ( zExp <= 0 ) {
3995             isTiny =
3996                    (status->float_detect_tininess
3997                     == float_tininess_before_rounding)
3998                 || ( zExp < 0 )
3999                 || ! increment
4000                 || ( zSig0 < UINT64_C(0xFFFFFFFFFFFFFFFF) );
4001             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
4002             zExp = 0;
4003             if (isTiny && zSig1) {
4004                 float_raise(float_flag_underflow, status);
4005             }
4006             if (zSig1) {
4007                 status->float_exception_flags |= float_flag_inexact;
4008             }
4009             switch (roundingMode) {
4010             case float_round_nearest_even:
4011             case float_round_ties_away:
4012                 increment = ((int64_t)zSig1 < 0);
4013                 break;
4014             case float_round_to_zero:
4015                 increment = 0;
4016                 break;
4017             case float_round_up:
4018                 increment = !zSign && zSig1;
4019                 break;
4020             case float_round_down:
4021                 increment = zSign && zSig1;
4022                 break;
4023             default:
4024                 abort();
4025             }
4026             if ( increment ) {
4027                 ++zSig0;
4028                 zSig0 &=
4029                     ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
4030                 if ( (int64_t) zSig0 < 0 ) zExp = 1;
4031             }
4032             return packFloatx80( zSign, zExp, zSig0 );
4033         }
4034     }
4035     if (zSig1) {
4036         status->float_exception_flags |= float_flag_inexact;
4037     }
4038     if ( increment ) {
4039         ++zSig0;
4040         if ( zSig0 == 0 ) {
4041             ++zExp;
4042             zSig0 = UINT64_C(0x8000000000000000);
4043         }
4044         else {
4045             zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
4046         }
4047     }
4048     else {
4049         if ( zSig0 == 0 ) zExp = 0;
4050     }
4051     return packFloatx80( zSign, zExp, zSig0 );
4052 
4053 }
4054 
4055 /*----------------------------------------------------------------------------
4056 | Takes an abstract floating-point value having sign `zSign', exponent
4057 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
4058 | and returns the proper extended double-precision floating-point value
4059 | corresponding to the abstract input.  This routine is just like
4060 | `roundAndPackFloatx80' except that the input significand does not have to be
4061 | normalized.
4062 *----------------------------------------------------------------------------*/
4063 
4064 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
4065                                        flag zSign, int32_t zExp,
4066                                        uint64_t zSig0, uint64_t zSig1,
4067                                        float_status *status)
4068 {
4069     int8_t shiftCount;
4070 
4071     if ( zSig0 == 0 ) {
4072         zSig0 = zSig1;
4073         zSig1 = 0;
4074         zExp -= 64;
4075     }
4076     shiftCount = clz64(zSig0);
4077     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4078     zExp -= shiftCount;
4079     return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
4080                                 zSig0, zSig1, status);
4081 
4082 }
4083 
4084 /*----------------------------------------------------------------------------
4085 | Returns the least-significant 64 fraction bits of the quadruple-precision
4086 | floating-point value `a'.
4087 *----------------------------------------------------------------------------*/
4088 
4089 static inline uint64_t extractFloat128Frac1( float128 a )
4090 {
4091 
4092     return a.low;
4093 
4094 }
4095 
4096 /*----------------------------------------------------------------------------
4097 | Returns the most-significant 48 fraction bits of the quadruple-precision
4098 | floating-point value `a'.
4099 *----------------------------------------------------------------------------*/
4100 
4101 static inline uint64_t extractFloat128Frac0( float128 a )
4102 {
4103 
4104     return a.high & UINT64_C(0x0000FFFFFFFFFFFF);
4105 
4106 }
4107 
4108 /*----------------------------------------------------------------------------
4109 | Returns the exponent bits of the quadruple-precision floating-point value
4110 | `a'.
4111 *----------------------------------------------------------------------------*/
4112 
4113 static inline int32_t extractFloat128Exp( float128 a )
4114 {
4115 
4116     return ( a.high>>48 ) & 0x7FFF;
4117 
4118 }
4119 
4120 /*----------------------------------------------------------------------------
4121 | Returns the sign bit of the quadruple-precision floating-point value `a'.
4122 *----------------------------------------------------------------------------*/
4123 
4124 static inline flag extractFloat128Sign( float128 a )
4125 {
4126 
4127     return a.high>>63;
4128 
4129 }
4130 
/*----------------------------------------------------------------------------
| Normalizes a subnormal quadruple-precision value whose denormalized
| significand is the concatenation of `aSig0' and `aSig1'.  The normalized
| exponent is written to `*zExpPtr'.  The most significant 49 bits of the
| normalized significand are written to `*zSig0Ptr' and the least significant
| 64 bits to `*zSig1Ptr'.
|     When `aSig0' is zero the shift is computed from `aSig1' alone, and the
| result may need that word split across both outputs.
*----------------------------------------------------------------------------*/

static void
 normalizeFloat128Subnormal(
     uint64_t aSig0,
     uint64_t aSig1,
     int32_t *zExpPtr,
     uint64_t *zSig0Ptr,
     uint64_t *zSig1Ptr
 )
{
    int8_t shift;

    if (aSig0 != 0) {
        shift = clz64(aSig0) - 15;
        shortShift128Left(aSig0, aSig1, shift, zSig0Ptr, zSig1Ptr);
        *zExpPtr = 1 - shift;
        return;
    }

    /* High word is empty: everything comes from the low word. */
    shift = clz64(aSig1) - 15;
    if (shift >= 0) {
        *zSig0Ptr = aSig1 << shift;
        *zSig1Ptr = 0;
    } else {
        /* Fewer than 15 leading zeros: split aSig1 across both words. */
        *zSig0Ptr = aSig1 >> (-shift);
        *zSig1Ptr = aSig1 << (shift & 63);
    }
    *zExpPtr = -shift - 63;
}
4171 
4172 /*----------------------------------------------------------------------------
4173 | Packs the sign `zSign', the exponent `zExp', and the significand formed
4174 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
4175 | floating-point value, returning the result.  After being shifted into the
4176 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
4177 | added together to form the most significant 32 bits of the result.  This
4178 | means that any integer portion of `zSig0' will be added into the exponent.
4179 | Since a properly normalized significand will have an integer portion equal
4180 | to 1, the `zExp' input should be 1 less than the desired result exponent
4181 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
4182 | significand.
4183 *----------------------------------------------------------------------------*/
4184 
4185 static inline float128
4186  packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
4187 {
4188     float128 z;
4189 
4190     z.low = zSig1;
4191     z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
4192     return z;
4193 
4194 }
4195 
4196 /*----------------------------------------------------------------------------
4197 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4198 | and extended significand formed by the concatenation of `zSig0', `zSig1',
4199 | and `zSig2', and returns the proper quadruple-precision floating-point value
4200 | corresponding to the abstract input.  Ordinarily, the abstract value is
4201 | simply rounded and packed into the quadruple-precision format, with the
4202 | inexact exception raised if the abstract input cannot be represented
4203 | exactly.  However, if the abstract value is too large, the overflow and
4204 | inexact exceptions are raised and an infinity or maximal finite value is
4205 | returned.  If the abstract value is too small, the input value is rounded to
4206 | a subnormal number, and the underflow and inexact exceptions are raised if
4207 | the abstract input cannot be represented exactly as a subnormal quadruple-
4208 | precision floating-point number.
4209 |     The input significand must be normalized or smaller.  If the input
4210 | significand is not normalized, `zExp' must be 0; in that case, the result
4211 | returned is a subnormal number, and it must not require rounding.  In the
4212 | usual case that the input significand is normalized, `zExp' must be 1 less
4213 | than the ``true'' floating-point exponent.  The handling of underflow and
4214 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4215 *----------------------------------------------------------------------------*/
4216 
4217 static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
4218                                      uint64_t zSig0, uint64_t zSig1,
4219                                      uint64_t zSig2, float_status *status)
4220 {
4221     int8_t roundingMode;
4222     flag roundNearestEven, increment, isTiny;
4223 
4224     roundingMode = status->float_rounding_mode;
4225     roundNearestEven = ( roundingMode == float_round_nearest_even );
4226     switch (roundingMode) {
4227     case float_round_nearest_even:
4228     case float_round_ties_away:
4229         increment = ((int64_t)zSig2 < 0);
4230         break;
4231     case float_round_to_zero:
4232         increment = 0;
4233         break;
4234     case float_round_up:
4235         increment = !zSign && zSig2;
4236         break;
4237     case float_round_down:
4238         increment = zSign && zSig2;
4239         break;
4240     case float_round_to_odd:
4241         increment = !(zSig1 & 0x1) && zSig2;
4242         break;
4243     default:
4244         abort();
4245     }
4246     if ( 0x7FFD <= (uint32_t) zExp ) {
4247         if (    ( 0x7FFD < zExp )
4248              || (    ( zExp == 0x7FFD )
4249                   && eq128(
4250                          UINT64_C(0x0001FFFFFFFFFFFF),
4251                          UINT64_C(0xFFFFFFFFFFFFFFFF),
4252                          zSig0,
4253                          zSig1
4254                      )
4255                   && increment
4256                 )
4257            ) {
4258             float_raise(float_flag_overflow | float_flag_inexact, status);
4259             if (    ( roundingMode == float_round_to_zero )
4260                  || ( zSign && ( roundingMode == float_round_up ) )
4261                  || ( ! zSign && ( roundingMode == float_round_down ) )
4262                  || (roundingMode == float_round_to_odd)
4263                ) {
4264                 return
4265                     packFloat128(
4266                         zSign,
4267                         0x7FFE,
4268                         UINT64_C(0x0000FFFFFFFFFFFF),
4269                         UINT64_C(0xFFFFFFFFFFFFFFFF)
4270                     );
4271             }
4272             return packFloat128( zSign, 0x7FFF, 0, 0 );
4273         }
4274         if ( zExp < 0 ) {
4275             if (status->flush_to_zero) {
4276                 float_raise(float_flag_output_denormal, status);
4277                 return packFloat128(zSign, 0, 0, 0);
4278             }
4279             isTiny =
4280                    (status->float_detect_tininess
4281                     == float_tininess_before_rounding)
4282                 || ( zExp < -1 )
4283                 || ! increment
4284                 || lt128(
4285                        zSig0,
4286                        zSig1,
4287                        UINT64_C(0x0001FFFFFFFFFFFF),
4288                        UINT64_C(0xFFFFFFFFFFFFFFFF)
4289                    );
4290             shift128ExtraRightJamming(
4291                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
4292             zExp = 0;
4293             if (isTiny && zSig2) {
4294                 float_raise(float_flag_underflow, status);
4295             }
4296             switch (roundingMode) {
4297             case float_round_nearest_even:
4298             case float_round_ties_away:
4299                 increment = ((int64_t)zSig2 < 0);
4300                 break;
4301             case float_round_to_zero:
4302                 increment = 0;
4303                 break;
4304             case float_round_up:
4305                 increment = !zSign && zSig2;
4306                 break;
4307             case float_round_down:
4308                 increment = zSign && zSig2;
4309                 break;
4310             case float_round_to_odd:
4311                 increment = !(zSig1 & 0x1) && zSig2;
4312                 break;
4313             default:
4314                 abort();
4315             }
4316         }
4317     }
4318     if (zSig2) {
4319         status->float_exception_flags |= float_flag_inexact;
4320     }
4321     if ( increment ) {
4322         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
4323         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
4324     }
4325     else {
4326         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
4327     }
4328     return packFloat128( zSign, zExp, zSig0, zSig1 );
4329 
4330 }
4331 
4332 /*----------------------------------------------------------------------------
4333 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4334 | and significand formed by the concatenation of `zSig0' and `zSig1', and
4335 | returns the proper quadruple-precision floating-point value corresponding
4336 | to the abstract input.  This routine is just like `roundAndPackFloat128'
4337 | except that the input significand has fewer bits and does not have to be
4338 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
4339 | point exponent.
4340 *----------------------------------------------------------------------------*/
4341 
4342 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
4343                                               uint64_t zSig0, uint64_t zSig1,
4344                                               float_status *status)
4345 {
4346     int8_t shiftCount;
4347     uint64_t zSig2;
4348 
4349     if ( zSig0 == 0 ) {
4350         zSig0 = zSig1;
4351         zSig1 = 0;
4352         zExp -= 64;
4353     }
4354     shiftCount = clz64(zSig0) - 15;
4355     if ( 0 <= shiftCount ) {
4356         zSig2 = 0;
4357         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4358     }
4359     else {
4360         shift128ExtraRightJamming(
4361             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
4362     }
4363     zExp -= shiftCount;
4364     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
4365 
4366 }
4367 
4368 
4369 /*----------------------------------------------------------------------------
4370 | Returns the result of converting the 32-bit two's complement integer `a'
4371 | to the extended double-precision floating-point format.  The conversion
4372 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4373 | Arithmetic.
4374 *----------------------------------------------------------------------------*/
4375 
4376 floatx80 int32_to_floatx80(int32_t a, float_status *status)
4377 {
4378     flag zSign;
4379     uint32_t absA;
4380     int8_t shiftCount;
4381     uint64_t zSig;
4382 
4383     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
4384     zSign = ( a < 0 );
4385     absA = zSign ? - a : a;
4386     shiftCount = clz32(absA) + 32;
4387     zSig = absA;
4388     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
4389 
4390 }
4391 
4392 /*----------------------------------------------------------------------------
4393 | Returns the result of converting the 32-bit two's complement integer `a' to
4394 | the quadruple-precision floating-point format.  The conversion is performed
4395 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4396 *----------------------------------------------------------------------------*/
4397 
4398 float128 int32_to_float128(int32_t a, float_status *status)
4399 {
4400     flag zSign;
4401     uint32_t absA;
4402     int8_t shiftCount;
4403     uint64_t zSig0;
4404 
4405     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
4406     zSign = ( a < 0 );
4407     absA = zSign ? - a : a;
4408     shiftCount = clz32(absA) + 17;
4409     zSig0 = absA;
4410     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
4411 
4412 }
4413 
4414 /*----------------------------------------------------------------------------
4415 | Returns the result of converting the 64-bit two's complement integer `a'
4416 | to the extended double-precision floating-point format.  The conversion
4417 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4418 | Arithmetic.
4419 *----------------------------------------------------------------------------*/
4420 
4421 floatx80 int64_to_floatx80(int64_t a, float_status *status)
4422 {
4423     flag zSign;
4424     uint64_t absA;
4425     int8_t shiftCount;
4426 
4427     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
4428     zSign = ( a < 0 );
4429     absA = zSign ? - a : a;
4430     shiftCount = clz64(absA);
4431     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
4432 
4433 }
4434 
4435 /*----------------------------------------------------------------------------
4436 | Returns the result of converting the 64-bit two's complement integer `a' to
4437 | the quadruple-precision floating-point format.  The conversion is performed
4438 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4439 *----------------------------------------------------------------------------*/
4440 
4441 float128 int64_to_float128(int64_t a, float_status *status)
4442 {
4443     flag zSign;
4444     uint64_t absA;
4445     int8_t shiftCount;
4446     int32_t zExp;
4447     uint64_t zSig0, zSig1;
4448 
4449     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
4450     zSign = ( a < 0 );
4451     absA = zSign ? - a : a;
4452     shiftCount = clz64(absA) + 49;
4453     zExp = 0x406E - shiftCount;
4454     if ( 64 <= shiftCount ) {
4455         zSig1 = 0;
4456         zSig0 = absA;
4457         shiftCount -= 64;
4458     }
4459     else {
4460         zSig1 = absA;
4461         zSig0 = 0;
4462     }
4463     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4464     return packFloat128( zSign, zExp, zSig0, zSig1 );
4465 
4466 }
4467 
4468 /*----------------------------------------------------------------------------
4469 | Returns the result of converting the 64-bit unsigned integer `a'
4470 | to the quadruple-precision floating-point format.  The conversion is performed
4471 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4472 *----------------------------------------------------------------------------*/
4473 
4474 float128 uint64_to_float128(uint64_t a, float_status *status)
4475 {
4476     if (a == 0) {
4477         return float128_zero;
4478     }
4479     return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status);
4480 }
4481 
4482 /*----------------------------------------------------------------------------
4483 | Returns the result of converting the single-precision floating-point value
4484 | `a' to the extended double-precision floating-point format.  The conversion
4485 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4486 | Arithmetic.
4487 *----------------------------------------------------------------------------*/
4488 
4489 floatx80 float32_to_floatx80(float32 a, float_status *status)
4490 {
4491     flag aSign;
4492     int aExp;
4493     uint32_t aSig;
4494 
4495     a = float32_squash_input_denormal(a, status);
4496     aSig = extractFloat32Frac( a );
4497     aExp = extractFloat32Exp( a );
4498     aSign = extractFloat32Sign( a );
4499     if ( aExp == 0xFF ) {
4500         if (aSig) {
4501             return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
4502         }
4503         return packFloatx80(aSign,
4504                             floatx80_infinity_high,
4505                             floatx80_infinity_low);
4506     }
4507     if ( aExp == 0 ) {
4508         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
4509         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4510     }
4511     aSig |= 0x00800000;
4512     return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
4513 
4514 }
4515 
4516 /*----------------------------------------------------------------------------
4517 | Returns the result of converting the single-precision floating-point value
4518 | `a' to the double-precision floating-point format.  The conversion is
4519 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4520 | Arithmetic.
4521 *----------------------------------------------------------------------------*/
4522 
4523 float128 float32_to_float128(float32 a, float_status *status)
4524 {
4525     flag aSign;
4526     int aExp;
4527     uint32_t aSig;
4528 
4529     a = float32_squash_input_denormal(a, status);
4530     aSig = extractFloat32Frac( a );
4531     aExp = extractFloat32Exp( a );
4532     aSign = extractFloat32Sign( a );
4533     if ( aExp == 0xFF ) {
4534         if (aSig) {
4535             return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
4536         }
4537         return packFloat128( aSign, 0x7FFF, 0, 0 );
4538     }
4539     if ( aExp == 0 ) {
4540         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
4541         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4542         --aExp;
4543     }
4544     return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
4545 
4546 }
4547 
/*----------------------------------------------------------------------------
| Returns the remainder of the single-precision floating-point value `a'
| with respect to the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
|
| Special cases handled before the reduction:
|   - either operand is a NaN: the NaN is propagated;
|   - `a' is infinite, or `b' is zero: invalid is raised and the default NaN
|     is returned;
|   - `b' is infinite, or `a' is zero: `a' is returned unchanged.
*----------------------------------------------------------------------------*/

float32 float32_rem(float32 a, float32 b, float_status *status)
{
    flag aSign, zSign;
    int aExp, bExp, expDiff;
    uint32_t aSig, bSig;
    uint32_t q;
    uint64_t aSig64, bSig64, q64;
    uint32_t alternateASig;
    int32_t sigMean;
    a = float32_squash_input_denormal(a, status);
    b = float32_squash_input_denormal(b, status);

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    bSig = extractFloat32Frac( b );
    bExp = extractFloat32Exp( b );
    /* `a' is a NaN or an infinity. */
    if ( aExp == 0xFF ) {
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
            return propagateFloat32NaN(a, b, status);
        }
        /* rem(inf, b) is invalid. */
        float_raise(float_flag_invalid, status);
        return float32_default_nan(status);
    }
    /* `b' is a NaN or an infinity (and `a' is finite). */
    if ( bExp == 0xFF ) {
        if (bSig) {
            return propagateFloat32NaN(a, b, status);
        }
        return a;
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            /* rem(a, 0) is invalid. */
            float_raise(float_flag_invalid, status);
            return float32_default_nan(status);
        }
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return a;
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    expDiff = aExp - bExp;
    /* Set bit 23 (the implicit integer bit) in both significands. */
    aSig |= 0x00800000;
    bSig |= 0x00800000;
    if ( expDiff < 32 ) {
        /* Small exponent gap: stay in 32-bit arithmetic. */
        aSig <<= 8;
        bSig <<= 8;
        if ( expDiff < 0 ) {
            /* Exponent of `a' is at least two below that of `b'. */
            if ( expDiff < -1 ) return a;
            aSig >>= 1;
        }
        q = ( bSig <= aSig );
        if ( q ) aSig -= bSig;
        if ( 0 < expDiff ) {
            q = ( ( (uint64_t) aSig )<<32 ) / bSig;
            q >>= 32 - expDiff;
            bSig >>= 2;
            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
        }
        else {
            aSig >>= 2;
            bSig >>= 2;
        }
    }
    else {
        /*
         * Large exponent gap: reduce in 64-bit arithmetic, consuming 62
         * units of exponent difference per loop iteration.
         */
        if ( bSig <= aSig ) aSig -= bSig;
        aSig64 = ( (uint64_t) aSig )<<40;
        bSig64 = ( (uint64_t) bSig )<<40;
        expDiff -= 64;
        while ( 0 < expDiff ) {
            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
            /*
             * Back the estimate off by 2, clamping at zero.
             * NOTE(review): presumably this keeps the quotient from
             * exceeding the true one; it depends on the error bound of
             * estimateDiv128To64, which is not visible here.
             */
            q64 = ( 2 < q64 ) ? q64 - 2 : 0;
            aSig64 = - ( ( bSig * q64 )<<38 );
            expDiff -= 62;
        }
        expDiff += 64;
        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
        q = q64>>( 64 - expDiff );
        bSig <<= 6;
        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
    }
    /*
     * Keep subtracting bSig, counting in q, until aSig goes negative when
     * viewed as int32_t.  alternateASig holds the value just before the
     * last subtraction, i.e. the last non-negative candidate.
     */
    do {
        alternateASig = aSig;
        ++q;
        aSig -= bSig;
    } while ( 0 <= (int32_t) aSig );
    /*
     * Choose between the negative candidate (aSig) and the non-negative one
     * (alternateASig): the latter wins when the sum of the two is negative,
     * or when the sum is zero and q is odd.
     */
    sigMean = aSig + alternateASig;
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
        aSig = alternateASig;
    }
    /* A negative remainder flips the sign of the result relative to `a'. */
    zSign = ( (int32_t) aSig < 0 );
    if ( zSign ) aSig = - aSig;
    return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
}
4649 
4650 
4651 
4652 /*----------------------------------------------------------------------------
4653 | Returns the binary exponential of the single-precision floating-point value
4654 | `a'. The operation is performed according to the IEC/IEEE Standard for
4655 | Binary Floating-Point Arithmetic.
4656 |
4657 | Uses the following identities:
4658 |
4659 | 1. -------------------------------------------------------------------------
4660 |      x    x*ln(2)
4661 |     2  = e
4662 |
4663 | 2. -------------------------------------------------------------------------
4664 |                      2     3     4     5           n
4665 |      x        x     x     x     x     x           x
4666 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
4667 |               1!    2!    3!    4!    5!          n!
4668 *----------------------------------------------------------------------------*/
4669 
4670 static const float64 float32_exp2_coefficients[15] =
4671 {
4672     const_float64( 0x3ff0000000000000ll ), /*  1 */
4673     const_float64( 0x3fe0000000000000ll ), /*  2 */
4674     const_float64( 0x3fc5555555555555ll ), /*  3 */
4675     const_float64( 0x3fa5555555555555ll ), /*  4 */
4676     const_float64( 0x3f81111111111111ll ), /*  5 */
4677     const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
4678     const_float64( 0x3f2a01a01a01a01all ), /*  7 */
4679     const_float64( 0x3efa01a01a01a01all ), /*  8 */
4680     const_float64( 0x3ec71de3a556c734ll ), /*  9 */
4681     const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
4682     const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
4683     const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
4684     const_float64( 0x3de6124613a86d09ll ), /* 13 */
4685     const_float64( 0x3da93974a8c07c9dll ), /* 14 */
4686     const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
4687 };
4688 
4689 float32 float32_exp2(float32 a, float_status *status)
4690 {
4691     flag aSign;
4692     int aExp;
4693     uint32_t aSig;
4694     float64 r, x, xn;
4695     int i;
4696     a = float32_squash_input_denormal(a, status);
4697 
4698     aSig = extractFloat32Frac( a );
4699     aExp = extractFloat32Exp( a );
4700     aSign = extractFloat32Sign( a );
4701 
4702     if ( aExp == 0xFF) {
4703         if (aSig) {
4704             return propagateFloat32NaN(a, float32_zero, status);
4705         }
4706         return (aSign) ? float32_zero : a;
4707     }
4708     if (aExp == 0) {
4709         if (aSig == 0) return float32_one;
4710     }
4711 
4712     float_raise(float_flag_inexact, status);
4713 
4714     /* ******************************* */
4715     /* using float64 for approximation */
4716     /* ******************************* */
4717     x = float32_to_float64(a, status);
4718     x = float64_mul(x, float64_ln2, status);
4719 
4720     xn = x;
4721     r = float64_one;
4722     for (i = 0 ; i < 15 ; i++) {
4723         float64 f;
4724 
4725         f = float64_mul(xn, float32_exp2_coefficients[i], status);
4726         r = float64_add(r, f, status);
4727 
4728         xn = float64_mul(xn, x, status);
4729     }
4730 
4731     return float64_to_float32(r, status);
4732 }
4733 
4734 /*----------------------------------------------------------------------------
4735 | Returns the binary log of the single-precision floating-point value `a'.
4736 | The operation is performed according to the IEC/IEEE Standard for Binary
4737 | Floating-Point Arithmetic.
4738 *----------------------------------------------------------------------------*/
4739 float32 float32_log2(float32 a, float_status *status)
4740 {
4741     flag aSign, zSign;
4742     int aExp;
4743     uint32_t aSig, zSig, i;
4744 
4745     a = float32_squash_input_denormal(a, status);
4746     aSig = extractFloat32Frac( a );
4747     aExp = extractFloat32Exp( a );
4748     aSign = extractFloat32Sign( a );
4749 
4750     if ( aExp == 0 ) {
4751         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
4752         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4753     }
4754     if ( aSign ) {
4755         float_raise(float_flag_invalid, status);
4756         return float32_default_nan(status);
4757     }
4758     if ( aExp == 0xFF ) {
4759         if (aSig) {
4760             return propagateFloat32NaN(a, float32_zero, status);
4761         }
4762         return a;
4763     }
4764 
4765     aExp -= 0x7F;
4766     aSig |= 0x00800000;
4767     zSign = aExp < 0;
4768     zSig = aExp << 23;
4769 
4770     for (i = 1 << 22; i > 0; i >>= 1) {
4771         aSig = ( (uint64_t)aSig * aSig ) >> 23;
4772         if ( aSig & 0x01000000 ) {
4773             aSig >>= 1;
4774             zSig |= i;
4775         }
4776     }
4777 
4778     if ( zSign )
4779         zSig = -zSig;
4780 
4781     return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
4782 }
4783 
4784 /*----------------------------------------------------------------------------
4785 | Returns 1 if the single-precision floating-point value `a' is equal to
4786 | the corresponding value `b', and 0 otherwise.  The invalid exception is
4787 | raised if either operand is a NaN.  Otherwise, the comparison is performed
4788 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4789 *----------------------------------------------------------------------------*/
4790 
4791 int float32_eq(float32 a, float32 b, float_status *status)
4792 {
4793     uint32_t av, bv;
4794     a = float32_squash_input_denormal(a, status);
4795     b = float32_squash_input_denormal(b, status);
4796 
4797     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4798          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4799        ) {
4800         float_raise(float_flag_invalid, status);
4801         return 0;
4802     }
4803     av = float32_val(a);
4804     bv = float32_val(b);
4805     return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
4806 }
4807 
4808 /*----------------------------------------------------------------------------
4809 | Returns 1 if the single-precision floating-point value `a' is less than
4810 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
4811 | exception is raised if either operand is a NaN.  The comparison is performed
4812 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4813 *----------------------------------------------------------------------------*/
4814 
4815 int float32_le(float32 a, float32 b, float_status *status)
4816 {
4817     flag aSign, bSign;
4818     uint32_t av, bv;
4819     a = float32_squash_input_denormal(a, status);
4820     b = float32_squash_input_denormal(b, status);
4821 
4822     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4823          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4824        ) {
4825         float_raise(float_flag_invalid, status);
4826         return 0;
4827     }
4828     aSign = extractFloat32Sign( a );
4829     bSign = extractFloat32Sign( b );
4830     av = float32_val(a);
4831     bv = float32_val(b);
4832     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
4833     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4834 
4835 }
4836 
4837 /*----------------------------------------------------------------------------
4838 | Returns 1 if the single-precision floating-point value `a' is less than
4839 | the corresponding value `b', and 0 otherwise.  The invalid exception is
4840 | raised if either operand is a NaN.  The comparison is performed according
4841 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4842 *----------------------------------------------------------------------------*/
4843 
4844 int float32_lt(float32 a, float32 b, float_status *status)
4845 {
4846     flag aSign, bSign;
4847     uint32_t av, bv;
4848     a = float32_squash_input_denormal(a, status);
4849     b = float32_squash_input_denormal(b, status);
4850 
4851     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4852          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4853        ) {
4854         float_raise(float_flag_invalid, status);
4855         return 0;
4856     }
4857     aSign = extractFloat32Sign( a );
4858     bSign = extractFloat32Sign( b );
4859     av = float32_val(a);
4860     bv = float32_val(b);
4861     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
4862     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4863 
4864 }
4865 
4866 /*----------------------------------------------------------------------------
4867 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
4868 | be compared, and 0 otherwise.  The invalid exception is raised if either
4869 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
4870 | Standard for Binary Floating-Point Arithmetic.
4871 *----------------------------------------------------------------------------*/
4872 
4873 int float32_unordered(float32 a, float32 b, float_status *status)
4874 {
4875     a = float32_squash_input_denormal(a, status);
4876     b = float32_squash_input_denormal(b, status);
4877 
4878     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4879          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4880        ) {
4881         float_raise(float_flag_invalid, status);
4882         return 1;
4883     }
4884     return 0;
4885 }
4886 
4887 /*----------------------------------------------------------------------------
4888 | Returns 1 if the single-precision floating-point value `a' is equal to
4889 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4890 | exception.  The comparison is performed according to the IEC/IEEE Standard
4891 | for Binary Floating-Point Arithmetic.
4892 *----------------------------------------------------------------------------*/
4893 
4894 int float32_eq_quiet(float32 a, float32 b, float_status *status)
4895 {
4896     a = float32_squash_input_denormal(a, status);
4897     b = float32_squash_input_denormal(b, status);
4898 
4899     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4900          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4901        ) {
4902         if (float32_is_signaling_nan(a, status)
4903          || float32_is_signaling_nan(b, status)) {
4904             float_raise(float_flag_invalid, status);
4905         }
4906         return 0;
4907     }
4908     return ( float32_val(a) == float32_val(b) ) ||
4909             ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
4910 }
4911 
4912 /*----------------------------------------------------------------------------
4913 | Returns 1 if the single-precision floating-point value `a' is less than or
4914 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4915 | cause an exception.  Otherwise, the comparison is performed according to the
4916 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4917 *----------------------------------------------------------------------------*/
4918 
4919 int float32_le_quiet(float32 a, float32 b, float_status *status)
4920 {
4921     flag aSign, bSign;
4922     uint32_t av, bv;
4923     a = float32_squash_input_denormal(a, status);
4924     b = float32_squash_input_denormal(b, status);
4925 
4926     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4927          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4928        ) {
4929         if (float32_is_signaling_nan(a, status)
4930          || float32_is_signaling_nan(b, status)) {
4931             float_raise(float_flag_invalid, status);
4932         }
4933         return 0;
4934     }
4935     aSign = extractFloat32Sign( a );
4936     bSign = extractFloat32Sign( b );
4937     av = float32_val(a);
4938     bv = float32_val(b);
4939     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
4940     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4941 
4942 }
4943 
4944 /*----------------------------------------------------------------------------
4945 | Returns 1 if the single-precision floating-point value `a' is less than
4946 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4947 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4948 | Standard for Binary Floating-Point Arithmetic.
4949 *----------------------------------------------------------------------------*/
4950 
4951 int float32_lt_quiet(float32 a, float32 b, float_status *status)
4952 {
4953     flag aSign, bSign;
4954     uint32_t av, bv;
4955     a = float32_squash_input_denormal(a, status);
4956     b = float32_squash_input_denormal(b, status);
4957 
4958     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4959          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4960        ) {
4961         if (float32_is_signaling_nan(a, status)
4962          || float32_is_signaling_nan(b, status)) {
4963             float_raise(float_flag_invalid, status);
4964         }
4965         return 0;
4966     }
4967     aSign = extractFloat32Sign( a );
4968     bSign = extractFloat32Sign( b );
4969     av = float32_val(a);
4970     bv = float32_val(b);
4971     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
4972     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4973 
4974 }
4975 
4976 /*----------------------------------------------------------------------------
4977 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
4978 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
4979 | comparison is performed according to the IEC/IEEE Standard for Binary
4980 | Floating-Point Arithmetic.
4981 *----------------------------------------------------------------------------*/
4982 
4983 int float32_unordered_quiet(float32 a, float32 b, float_status *status)
4984 {
4985     a = float32_squash_input_denormal(a, status);
4986     b = float32_squash_input_denormal(b, status);
4987 
4988     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4989          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4990        ) {
4991         if (float32_is_signaling_nan(a, status)
4992          || float32_is_signaling_nan(b, status)) {
4993             float_raise(float_flag_invalid, status);
4994         }
4995         return 1;
4996     }
4997     return 0;
4998 }
4999 
5000 /*----------------------------------------------------------------------------
5001 | Returns the result of converting the double-precision floating-point value
5002 | `a' to the extended double-precision floating-point format.  The conversion
5003 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5004 | Arithmetic.
5005 *----------------------------------------------------------------------------*/
5006 
5007 floatx80 float64_to_floatx80(float64 a, float_status *status)
5008 {
5009     flag aSign;
5010     int aExp;
5011     uint64_t aSig;
5012 
5013     a = float64_squash_input_denormal(a, status);
5014     aSig = extractFloat64Frac( a );
5015     aExp = extractFloat64Exp( a );
5016     aSign = extractFloat64Sign( a );
5017     if ( aExp == 0x7FF ) {
5018         if (aSig) {
5019             return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
5020         }
5021         return packFloatx80(aSign,
5022                             floatx80_infinity_high,
5023                             floatx80_infinity_low);
5024     }
5025     if ( aExp == 0 ) {
5026         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
5027         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5028     }
5029     return
5030         packFloatx80(
5031             aSign, aExp + 0x3C00, (aSig | UINT64_C(0x0010000000000000)) << 11);
5032 
5033 }
5034 
5035 /*----------------------------------------------------------------------------
5036 | Returns the result of converting the double-precision floating-point value
5037 | `a' to the quadruple-precision floating-point format.  The conversion is
5038 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5039 | Arithmetic.
5040 *----------------------------------------------------------------------------*/
5041 
5042 float128 float64_to_float128(float64 a, float_status *status)
5043 {
5044     flag aSign;
5045     int aExp;
5046     uint64_t aSig, zSig0, zSig1;
5047 
5048     a = float64_squash_input_denormal(a, status);
5049     aSig = extractFloat64Frac( a );
5050     aExp = extractFloat64Exp( a );
5051     aSign = extractFloat64Sign( a );
5052     if ( aExp == 0x7FF ) {
5053         if (aSig) {
5054             return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
5055         }
5056         return packFloat128( aSign, 0x7FFF, 0, 0 );
5057     }
5058     if ( aExp == 0 ) {
5059         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
5060         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5061         --aExp;
5062     }
5063     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
5064     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
5065 
5066 }
5067 
5068 
5069 /*----------------------------------------------------------------------------
5070 | Returns the remainder of the double-precision floating-point value `a'
5071 | with respect to the corresponding value `b'.  The operation is performed
5072 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5073 *----------------------------------------------------------------------------*/
5074 
5075 float64 float64_rem(float64 a, float64 b, float_status *status)
5076 {
5077     flag aSign, zSign;
5078     int aExp, bExp, expDiff;
5079     uint64_t aSig, bSig;
5080     uint64_t q, alternateASig;
5081     int64_t sigMean;
5082 
5083     a = float64_squash_input_denormal(a, status);
5084     b = float64_squash_input_denormal(b, status);
5085     aSig = extractFloat64Frac( a );
5086     aExp = extractFloat64Exp( a );
5087     aSign = extractFloat64Sign( a );
5088     bSig = extractFloat64Frac( b );
5089     bExp = extractFloat64Exp( b );
5090     if ( aExp == 0x7FF ) {
5091         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
5092             return propagateFloat64NaN(a, b, status);
5093         }
5094         float_raise(float_flag_invalid, status);
5095         return float64_default_nan(status);
5096     }
5097     if ( bExp == 0x7FF ) {
5098         if (bSig) {
5099             return propagateFloat64NaN(a, b, status);
5100         }
5101         return a;
5102     }
5103     if ( bExp == 0 ) {
5104         if ( bSig == 0 ) {
5105             float_raise(float_flag_invalid, status);
5106             return float64_default_nan(status);
5107         }
5108         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
5109     }
5110     if ( aExp == 0 ) {
5111         if ( aSig == 0 ) return a;
5112         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5113     }
5114     expDiff = aExp - bExp;
5115     aSig = (aSig | UINT64_C(0x0010000000000000)) << 11;
5116     bSig = (bSig | UINT64_C(0x0010000000000000)) << 11;
5117     if ( expDiff < 0 ) {
5118         if ( expDiff < -1 ) return a;
5119         aSig >>= 1;
5120     }
5121     q = ( bSig <= aSig );
5122     if ( q ) aSig -= bSig;
5123     expDiff -= 64;
5124     while ( 0 < expDiff ) {
5125         q = estimateDiv128To64( aSig, 0, bSig );
5126         q = ( 2 < q ) ? q - 2 : 0;
5127         aSig = - ( ( bSig>>2 ) * q );
5128         expDiff -= 62;
5129     }
5130     expDiff += 64;
5131     if ( 0 < expDiff ) {
5132         q = estimateDiv128To64( aSig, 0, bSig );
5133         q = ( 2 < q ) ? q - 2 : 0;
5134         q >>= 64 - expDiff;
5135         bSig >>= 2;
5136         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
5137     }
5138     else {
5139         aSig >>= 2;
5140         bSig >>= 2;
5141     }
5142     do {
5143         alternateASig = aSig;
5144         ++q;
5145         aSig -= bSig;
5146     } while ( 0 <= (int64_t) aSig );
5147     sigMean = aSig + alternateASig;
5148     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
5149         aSig = alternateASig;
5150     }
5151     zSign = ( (int64_t) aSig < 0 );
5152     if ( zSign ) aSig = - aSig;
5153     return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
5154 
5155 }
5156 
5157 /*----------------------------------------------------------------------------
5158 | Returns the binary log of the double-precision floating-point value `a'.
5159 | The operation is performed according to the IEC/IEEE Standard for Binary
5160 | Floating-Point Arithmetic.
5161 *----------------------------------------------------------------------------*/
5162 float64 float64_log2(float64 a, float_status *status)
5163 {
5164     flag aSign, zSign;
5165     int aExp;
5166     uint64_t aSig, aSig0, aSig1, zSig, i;
5167     a = float64_squash_input_denormal(a, status);
5168 
5169     aSig = extractFloat64Frac( a );
5170     aExp = extractFloat64Exp( a );
5171     aSign = extractFloat64Sign( a );
5172 
5173     if ( aExp == 0 ) {
5174         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
5175         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5176     }
5177     if ( aSign ) {
5178         float_raise(float_flag_invalid, status);
5179         return float64_default_nan(status);
5180     }
5181     if ( aExp == 0x7FF ) {
5182         if (aSig) {
5183             return propagateFloat64NaN(a, float64_zero, status);
5184         }
5185         return a;
5186     }
5187 
5188     aExp -= 0x3FF;
5189     aSig |= UINT64_C(0x0010000000000000);
5190     zSign = aExp < 0;
5191     zSig = (uint64_t)aExp << 52;
5192     for (i = 1LL << 51; i > 0; i >>= 1) {
5193         mul64To128( aSig, aSig, &aSig0, &aSig1 );
5194         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
5195         if ( aSig & UINT64_C(0x0020000000000000) ) {
5196             aSig >>= 1;
5197             zSig |= i;
5198         }
5199     }
5200 
5201     if ( zSign )
5202         zSig = -zSig;
5203     return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
5204 }
5205 
5206 /*----------------------------------------------------------------------------
5207 | Returns 1 if the double-precision floating-point value `a' is equal to the
5208 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
5209 | if either operand is a NaN.  Otherwise, the comparison is performed
5210 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5211 *----------------------------------------------------------------------------*/
5212 
5213 int float64_eq(float64 a, float64 b, float_status *status)
5214 {
5215     uint64_t av, bv;
5216     a = float64_squash_input_denormal(a, status);
5217     b = float64_squash_input_denormal(b, status);
5218 
5219     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5220          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5221        ) {
5222         float_raise(float_flag_invalid, status);
5223         return 0;
5224     }
5225     av = float64_val(a);
5226     bv = float64_val(b);
5227     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
5228 
5229 }
5230 
5231 /*----------------------------------------------------------------------------
5232 | Returns 1 if the double-precision floating-point value `a' is less than or
5233 | equal to the corresponding value `b', and 0 otherwise.  The invalid
5234 | exception is raised if either operand is a NaN.  The comparison is performed
5235 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5236 *----------------------------------------------------------------------------*/
5237 
5238 int float64_le(float64 a, float64 b, float_status *status)
5239 {
5240     flag aSign, bSign;
5241     uint64_t av, bv;
5242     a = float64_squash_input_denormal(a, status);
5243     b = float64_squash_input_denormal(b, status);
5244 
5245     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5246          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5247        ) {
5248         float_raise(float_flag_invalid, status);
5249         return 0;
5250     }
5251     aSign = extractFloat64Sign( a );
5252     bSign = extractFloat64Sign( b );
5253     av = float64_val(a);
5254     bv = float64_val(b);
5255     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
5256     return ( av == bv ) || ( aSign ^ ( av < bv ) );
5257 
5258 }
5259 
5260 /*----------------------------------------------------------------------------
5261 | Returns 1 if the double-precision floating-point value `a' is less than
5262 | the corresponding value `b', and 0 otherwise.  The invalid exception is
5263 | raised if either operand is a NaN.  The comparison is performed according
5264 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5265 *----------------------------------------------------------------------------*/
5266 
5267 int float64_lt(float64 a, float64 b, float_status *status)
5268 {
5269     flag aSign, bSign;
5270     uint64_t av, bv;
5271 
5272     a = float64_squash_input_denormal(a, status);
5273     b = float64_squash_input_denormal(b, status);
5274     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5275          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5276        ) {
5277         float_raise(float_flag_invalid, status);
5278         return 0;
5279     }
5280     aSign = extractFloat64Sign( a );
5281     bSign = extractFloat64Sign( b );
5282     av = float64_val(a);
5283     bv = float64_val(b);
5284     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
5285     return ( av != bv ) && ( aSign ^ ( av < bv ) );
5286 
5287 }
5288 
5289 /*----------------------------------------------------------------------------
5290 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
5291 | be compared, and 0 otherwise.  The invalid exception is raised if either
5292 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
5293 | Standard for Binary Floating-Point Arithmetic.
5294 *----------------------------------------------------------------------------*/
5295 
5296 int float64_unordered(float64 a, float64 b, float_status *status)
5297 {
5298     a = float64_squash_input_denormal(a, status);
5299     b = float64_squash_input_denormal(b, status);
5300 
5301     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5302          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5303        ) {
5304         float_raise(float_flag_invalid, status);
5305         return 1;
5306     }
5307     return 0;
5308 }
5309 
5310 /*----------------------------------------------------------------------------
5311 | Returns 1 if the double-precision floating-point value `a' is equal to the
5312 | corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5313 | exception.The comparison is performed according to the IEC/IEEE Standard
5314 | for Binary Floating-Point Arithmetic.
5315 *----------------------------------------------------------------------------*/
5316 
5317 int float64_eq_quiet(float64 a, float64 b, float_status *status)
5318 {
5319     uint64_t av, bv;
5320     a = float64_squash_input_denormal(a, status);
5321     b = float64_squash_input_denormal(b, status);
5322 
5323     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5324          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5325        ) {
5326         if (float64_is_signaling_nan(a, status)
5327          || float64_is_signaling_nan(b, status)) {
5328             float_raise(float_flag_invalid, status);
5329         }
5330         return 0;
5331     }
5332     av = float64_val(a);
5333     bv = float64_val(b);
5334     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
5335 
5336 }
5337 
5338 /*----------------------------------------------------------------------------
5339 | Returns 1 if the double-precision floating-point value `a' is less than or
5340 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5341 | cause an exception.  Otherwise, the comparison is performed according to the
5342 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5343 *----------------------------------------------------------------------------*/
5344 
5345 int float64_le_quiet(float64 a, float64 b, float_status *status)
5346 {
5347     flag aSign, bSign;
5348     uint64_t av, bv;
5349     a = float64_squash_input_denormal(a, status);
5350     b = float64_squash_input_denormal(b, status);
5351 
5352     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5353          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5354        ) {
5355         if (float64_is_signaling_nan(a, status)
5356          || float64_is_signaling_nan(b, status)) {
5357             float_raise(float_flag_invalid, status);
5358         }
5359         return 0;
5360     }
5361     aSign = extractFloat64Sign( a );
5362     bSign = extractFloat64Sign( b );
5363     av = float64_val(a);
5364     bv = float64_val(b);
5365     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
5366     return ( av == bv ) || ( aSign ^ ( av < bv ) );
5367 
5368 }
5369 
5370 /*----------------------------------------------------------------------------
5371 | Returns 1 if the double-precision floating-point value `a' is less than
5372 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5373 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
5374 | Standard for Binary Floating-Point Arithmetic.
5375 *----------------------------------------------------------------------------*/
5376 
5377 int float64_lt_quiet(float64 a, float64 b, float_status *status)
5378 {
5379     flag aSign, bSign;
5380     uint64_t av, bv;
5381     a = float64_squash_input_denormal(a, status);
5382     b = float64_squash_input_denormal(b, status);
5383 
5384     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5385          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5386        ) {
5387         if (float64_is_signaling_nan(a, status)
5388          || float64_is_signaling_nan(b, status)) {
5389             float_raise(float_flag_invalid, status);
5390         }
5391         return 0;
5392     }
5393     aSign = extractFloat64Sign( a );
5394     bSign = extractFloat64Sign( b );
5395     av = float64_val(a);
5396     bv = float64_val(b);
5397     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
5398     return ( av != bv ) && ( aSign ^ ( av < bv ) );
5399 
5400 }
5401 
5402 /*----------------------------------------------------------------------------
5403 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
5404 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
5405 | comparison is performed according to the IEC/IEEE Standard for Binary
5406 | Floating-Point Arithmetic.
5407 *----------------------------------------------------------------------------*/
5408 
5409 int float64_unordered_quiet(float64 a, float64 b, float_status *status)
5410 {
5411     a = float64_squash_input_denormal(a, status);
5412     b = float64_squash_input_denormal(b, status);
5413 
5414     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5415          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5416        ) {
5417         if (float64_is_signaling_nan(a, status)
5418          || float64_is_signaling_nan(b, status)) {
5419             float_raise(float_flag_invalid, status);
5420         }
5421         return 1;
5422     }
5423     return 0;
5424 }
5425 
5426 /*----------------------------------------------------------------------------
5427 | Returns the result of converting the extended double-precision floating-
5428 | point value `a' to the 32-bit two's complement integer format.  The
5429 | conversion is performed according to the IEC/IEEE Standard for Binary
5430 | Floating-Point Arithmetic---which means in particular that the conversion
5431 | is rounded according to the current rounding mode.  If `a' is a NaN, the
5432 | largest positive integer is returned.  Otherwise, if the conversion
5433 | overflows, the largest integer with the same sign as `a' is returned.
5434 *----------------------------------------------------------------------------*/
5435 
5436 int32_t floatx80_to_int32(floatx80 a, float_status *status)
5437 {
5438     flag aSign;
5439     int32_t aExp, shiftCount;
5440     uint64_t aSig;
5441 
5442     if (floatx80_invalid_encoding(a)) {
5443         float_raise(float_flag_invalid, status);
5444         return 1 << 31;
5445     }
5446     aSig = extractFloatx80Frac( a );
5447     aExp = extractFloatx80Exp( a );
5448     aSign = extractFloatx80Sign( a );
5449     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
5450     shiftCount = 0x4037 - aExp;
5451     if ( shiftCount <= 0 ) shiftCount = 1;
5452     shift64RightJamming( aSig, shiftCount, &aSig );
5453     return roundAndPackInt32(aSign, aSig, status);
5454 
5455 }
5456 
5457 /*----------------------------------------------------------------------------
5458 | Returns the result of converting the extended double-precision floating-
5459 | point value `a' to the 32-bit two's complement integer format.  The
5460 | conversion is performed according to the IEC/IEEE Standard for Binary
5461 | Floating-Point Arithmetic, except that the conversion is always rounded
5462 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
5463 | Otherwise, if the conversion overflows, the largest integer with the same
5464 | sign as `a' is returned.
5465 *----------------------------------------------------------------------------*/
5466 
5467 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
5468 {
5469     flag aSign;
5470     int32_t aExp, shiftCount;
5471     uint64_t aSig, savedASig;
5472     int32_t z;
5473 
5474     if (floatx80_invalid_encoding(a)) {
5475         float_raise(float_flag_invalid, status);
5476         return 1 << 31;
5477     }
5478     aSig = extractFloatx80Frac( a );
5479     aExp = extractFloatx80Exp( a );
5480     aSign = extractFloatx80Sign( a );
5481     if ( 0x401E < aExp ) {
5482         if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
5483         goto invalid;
5484     }
5485     else if ( aExp < 0x3FFF ) {
5486         if (aExp || aSig) {
5487             status->float_exception_flags |= float_flag_inexact;
5488         }
5489         return 0;
5490     }
5491     shiftCount = 0x403E - aExp;
5492     savedASig = aSig;
5493     aSig >>= shiftCount;
5494     z = aSig;
5495     if ( aSign ) z = - z;
5496     if ( ( z < 0 ) ^ aSign ) {
5497  invalid:
5498         float_raise(float_flag_invalid, status);
5499         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5500     }
5501     if ( ( aSig<<shiftCount ) != savedASig ) {
5502         status->float_exception_flags |= float_flag_inexact;
5503     }
5504     return z;
5505 
5506 }
5507 
5508 /*----------------------------------------------------------------------------
5509 | Returns the result of converting the extended double-precision floating-
5510 | point value `a' to the 64-bit two's complement integer format.  The
5511 | conversion is performed according to the IEC/IEEE Standard for Binary
5512 | Floating-Point Arithmetic---which means in particular that the conversion
5513 | is rounded according to the current rounding mode.  If `a' is a NaN,
5514 | the largest positive integer is returned.  Otherwise, if the conversion
5515 | overflows, the largest integer with the same sign as `a' is returned.
5516 *----------------------------------------------------------------------------*/
5517 
5518 int64_t floatx80_to_int64(floatx80 a, float_status *status)
5519 {
5520     flag aSign;
5521     int32_t aExp, shiftCount;
5522     uint64_t aSig, aSigExtra;
5523 
5524     if (floatx80_invalid_encoding(a)) {
5525         float_raise(float_flag_invalid, status);
5526         return 1ULL << 63;
5527     }
5528     aSig = extractFloatx80Frac( a );
5529     aExp = extractFloatx80Exp( a );
5530     aSign = extractFloatx80Sign( a );
5531     shiftCount = 0x403E - aExp;
5532     if ( shiftCount <= 0 ) {
5533         if ( shiftCount ) {
5534             float_raise(float_flag_invalid, status);
5535             if (!aSign || floatx80_is_any_nan(a)) {
5536                 return INT64_MAX;
5537             }
5538             return INT64_MIN;
5539         }
5540         aSigExtra = 0;
5541     }
5542     else {
5543         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
5544     }
5545     return roundAndPackInt64(aSign, aSig, aSigExtra, status);
5546 
5547 }
5548 
5549 /*----------------------------------------------------------------------------
5550 | Returns the result of converting the extended double-precision floating-
5551 | point value `a' to the 64-bit two's complement integer format.  The
5552 | conversion is performed according to the IEC/IEEE Standard for Binary
5553 | Floating-Point Arithmetic, except that the conversion is always rounded
5554 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
5555 | Otherwise, if the conversion overflows, the largest integer with the same
5556 | sign as `a' is returned.
5557 *----------------------------------------------------------------------------*/
5558 
5559 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
5560 {
5561     flag aSign;
5562     int32_t aExp, shiftCount;
5563     uint64_t aSig;
5564     int64_t z;
5565 
5566     if (floatx80_invalid_encoding(a)) {
5567         float_raise(float_flag_invalid, status);
5568         return 1ULL << 63;
5569     }
5570     aSig = extractFloatx80Frac( a );
5571     aExp = extractFloatx80Exp( a );
5572     aSign = extractFloatx80Sign( a );
5573     shiftCount = aExp - 0x403E;
5574     if ( 0 <= shiftCount ) {
5575         aSig &= UINT64_C(0x7FFFFFFFFFFFFFFF);
5576         if ( ( a.high != 0xC03E ) || aSig ) {
5577             float_raise(float_flag_invalid, status);
5578             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
5579                 return INT64_MAX;
5580             }
5581         }
5582         return INT64_MIN;
5583     }
5584     else if ( aExp < 0x3FFF ) {
5585         if (aExp | aSig) {
5586             status->float_exception_flags |= float_flag_inexact;
5587         }
5588         return 0;
5589     }
5590     z = aSig>>( - shiftCount );
5591     if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
5592         status->float_exception_flags |= float_flag_inexact;
5593     }
5594     if ( aSign ) z = - z;
5595     return z;
5596 
5597 }
5598 
5599 /*----------------------------------------------------------------------------
5600 | Returns the result of converting the extended double-precision floating-
5601 | point value `a' to the single-precision floating-point format.  The
5602 | conversion is performed according to the IEC/IEEE Standard for Binary
5603 | Floating-Point Arithmetic.
5604 *----------------------------------------------------------------------------*/
5605 
5606 float32 floatx80_to_float32(floatx80 a, float_status *status)
5607 {
5608     flag aSign;
5609     int32_t aExp;
5610     uint64_t aSig;
5611 
5612     if (floatx80_invalid_encoding(a)) {
5613         float_raise(float_flag_invalid, status);
5614         return float32_default_nan(status);
5615     }
5616     aSig = extractFloatx80Frac( a );
5617     aExp = extractFloatx80Exp( a );
5618     aSign = extractFloatx80Sign( a );
5619     if ( aExp == 0x7FFF ) {
5620         if ( (uint64_t) ( aSig<<1 ) ) {
5621             return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
5622         }
5623         return packFloat32( aSign, 0xFF, 0 );
5624     }
5625     shift64RightJamming( aSig, 33, &aSig );
5626     if ( aExp || aSig ) aExp -= 0x3F81;
5627     return roundAndPackFloat32(aSign, aExp, aSig, status);
5628 
5629 }
5630 
5631 /*----------------------------------------------------------------------------
5632 | Returns the result of converting the extended double-precision floating-
5633 | point value `a' to the double-precision floating-point format.  The
5634 | conversion is performed according to the IEC/IEEE Standard for Binary
5635 | Floating-Point Arithmetic.
5636 *----------------------------------------------------------------------------*/
5637 
5638 float64 floatx80_to_float64(floatx80 a, float_status *status)
5639 {
5640     flag aSign;
5641     int32_t aExp;
5642     uint64_t aSig, zSig;
5643 
5644     if (floatx80_invalid_encoding(a)) {
5645         float_raise(float_flag_invalid, status);
5646         return float64_default_nan(status);
5647     }
5648     aSig = extractFloatx80Frac( a );
5649     aExp = extractFloatx80Exp( a );
5650     aSign = extractFloatx80Sign( a );
5651     if ( aExp == 0x7FFF ) {
5652         if ( (uint64_t) ( aSig<<1 ) ) {
5653             return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
5654         }
5655         return packFloat64( aSign, 0x7FF, 0 );
5656     }
5657     shift64RightJamming( aSig, 1, &zSig );
5658     if ( aExp || aSig ) aExp -= 0x3C01;
5659     return roundAndPackFloat64(aSign, aExp, zSig, status);
5660 
5661 }
5662 
5663 /*----------------------------------------------------------------------------
5664 | Returns the result of converting the extended double-precision floating-
5665 | point value `a' to the quadruple-precision floating-point format.  The
5666 | conversion is performed according to the IEC/IEEE Standard for Binary
5667 | Floating-Point Arithmetic.
5668 *----------------------------------------------------------------------------*/
5669 
5670 float128 floatx80_to_float128(floatx80 a, float_status *status)
5671 {
5672     flag aSign;
5673     int aExp;
5674     uint64_t aSig, zSig0, zSig1;
5675 
5676     if (floatx80_invalid_encoding(a)) {
5677         float_raise(float_flag_invalid, status);
5678         return float128_default_nan(status);
5679     }
5680     aSig = extractFloatx80Frac( a );
5681     aExp = extractFloatx80Exp( a );
5682     aSign = extractFloatx80Sign( a );
5683     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
5684         return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
5685     }
5686     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
5687     return packFloat128( aSign, aExp, zSig0, zSig1 );
5688 
5689 }
5690 
5691 /*----------------------------------------------------------------------------
5692 | Rounds the extended double-precision floating-point value `a'
5693 | to the precision provided by floatx80_rounding_precision and returns the
5694 | result as an extended double-precision floating-point value.
5695 | The operation is performed according to the IEC/IEEE Standard for Binary
5696 | Floating-Point Arithmetic.
5697 *----------------------------------------------------------------------------*/
5698 
5699 floatx80 floatx80_round(floatx80 a, float_status *status)
5700 {
5701     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5702                                 extractFloatx80Sign(a),
5703                                 extractFloatx80Exp(a),
5704                                 extractFloatx80Frac(a), 0, status);
5705 }
5706 
5707 /*----------------------------------------------------------------------------
5708 | Rounds the extended double-precision floating-point value `a' to an integer,
5709 | and returns the result as an extended quadruple-precision floating-point
5710 | value.  The operation is performed according to the IEC/IEEE Standard for
5711 | Binary Floating-Point Arithmetic.
5712 *----------------------------------------------------------------------------*/
5713 
5714 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
5715 {
5716     flag aSign;
5717     int32_t aExp;
5718     uint64_t lastBitMask, roundBitsMask;
5719     floatx80 z;
5720 
5721     if (floatx80_invalid_encoding(a)) {
5722         float_raise(float_flag_invalid, status);
5723         return floatx80_default_nan(status);
5724     }
5725     aExp = extractFloatx80Exp( a );
5726     if ( 0x403E <= aExp ) {
5727         if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
5728             return propagateFloatx80NaN(a, a, status);
5729         }
5730         return a;
5731     }
5732     if ( aExp < 0x3FFF ) {
5733         if (    ( aExp == 0 )
5734              && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
5735             return a;
5736         }
5737         status->float_exception_flags |= float_flag_inexact;
5738         aSign = extractFloatx80Sign( a );
5739         switch (status->float_rounding_mode) {
5740          case float_round_nearest_even:
5741             if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
5742                ) {
5743                 return
5744                     packFloatx80( aSign, 0x3FFF, UINT64_C(0x8000000000000000));
5745             }
5746             break;
5747         case float_round_ties_away:
5748             if (aExp == 0x3FFE) {
5749                 return packFloatx80(aSign, 0x3FFF, UINT64_C(0x8000000000000000));
5750             }
5751             break;
5752          case float_round_down:
5753             return
5754                   aSign ?
5755                       packFloatx80( 1, 0x3FFF, UINT64_C(0x8000000000000000))
5756                 : packFloatx80( 0, 0, 0 );
5757          case float_round_up:
5758             return
5759                   aSign ? packFloatx80( 1, 0, 0 )
5760                 : packFloatx80( 0, 0x3FFF, UINT64_C(0x8000000000000000));
5761         }
5762         return packFloatx80( aSign, 0, 0 );
5763     }
5764     lastBitMask = 1;
5765     lastBitMask <<= 0x403E - aExp;
5766     roundBitsMask = lastBitMask - 1;
5767     z = a;
5768     switch (status->float_rounding_mode) {
5769     case float_round_nearest_even:
5770         z.low += lastBitMask>>1;
5771         if ((z.low & roundBitsMask) == 0) {
5772             z.low &= ~lastBitMask;
5773         }
5774         break;
5775     case float_round_ties_away:
5776         z.low += lastBitMask >> 1;
5777         break;
5778     case float_round_to_zero:
5779         break;
5780     case float_round_up:
5781         if (!extractFloatx80Sign(z)) {
5782             z.low += roundBitsMask;
5783         }
5784         break;
5785     case float_round_down:
5786         if (extractFloatx80Sign(z)) {
5787             z.low += roundBitsMask;
5788         }
5789         break;
5790     default:
5791         abort();
5792     }
5793     z.low &= ~ roundBitsMask;
5794     if ( z.low == 0 ) {
5795         ++z.high;
5796         z.low = UINT64_C(0x8000000000000000);
5797     }
5798     if (z.low != a.low) {
5799         status->float_exception_flags |= float_flag_inexact;
5800     }
5801     return z;
5802 
5803 }
5804 
5805 /*----------------------------------------------------------------------------
5806 | Returns the result of adding the absolute values of the extended double-
5807 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
5808 | negated before being returned.  `zSign' is ignored if the result is a NaN.
5809 | The addition is performed according to the IEC/IEEE Standard for Binary
5810 | Floating-Point Arithmetic.
5811 *----------------------------------------------------------------------------*/
5812 
5813 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5814                                 float_status *status)
5815 {
5816     int32_t aExp, bExp, zExp;
5817     uint64_t aSig, bSig, zSig0, zSig1;
5818     int32_t expDiff;
5819 
5820     aSig = extractFloatx80Frac( a );
5821     aExp = extractFloatx80Exp( a );
5822     bSig = extractFloatx80Frac( b );
5823     bExp = extractFloatx80Exp( b );
5824     expDiff = aExp - bExp;
5825     if ( 0 < expDiff ) {
5826         if ( aExp == 0x7FFF ) {
5827             if ((uint64_t)(aSig << 1)) {
5828                 return propagateFloatx80NaN(a, b, status);
5829             }
5830             return a;
5831         }
5832         if ( bExp == 0 ) --expDiff;
5833         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5834         zExp = aExp;
5835     }
5836     else if ( expDiff < 0 ) {
5837         if ( bExp == 0x7FFF ) {
5838             if ((uint64_t)(bSig << 1)) {
5839                 return propagateFloatx80NaN(a, b, status);
5840             }
5841             return packFloatx80(zSign,
5842                                 floatx80_infinity_high,
5843                                 floatx80_infinity_low);
5844         }
5845         if ( aExp == 0 ) ++expDiff;
5846         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5847         zExp = bExp;
5848     }
5849     else {
5850         if ( aExp == 0x7FFF ) {
5851             if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5852                 return propagateFloatx80NaN(a, b, status);
5853             }
5854             return a;
5855         }
5856         zSig1 = 0;
5857         zSig0 = aSig + bSig;
5858         if ( aExp == 0 ) {
5859             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
5860             goto roundAndPack;
5861         }
5862         zExp = aExp;
5863         goto shiftRight1;
5864     }
5865     zSig0 = aSig + bSig;
5866     if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
5867  shiftRight1:
5868     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
5869     zSig0 |= UINT64_C(0x8000000000000000);
5870     ++zExp;
5871  roundAndPack:
5872     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5873                                 zSign, zExp, zSig0, zSig1, status);
5874 }
5875 
5876 /*----------------------------------------------------------------------------
5877 | Returns the result of subtracting the absolute values of the extended
5878 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
5879 | difference is negated before being returned.  `zSign' is ignored if the
5880 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
5881 | Standard for Binary Floating-Point Arithmetic.
5882 *----------------------------------------------------------------------------*/
5883 
5884 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5885                                 float_status *status)
5886 {
5887     int32_t aExp, bExp, zExp;
5888     uint64_t aSig, bSig, zSig0, zSig1;
5889     int32_t expDiff;
5890 
5891     aSig = extractFloatx80Frac( a );
5892     aExp = extractFloatx80Exp( a );
5893     bSig = extractFloatx80Frac( b );
5894     bExp = extractFloatx80Exp( b );
5895     expDiff = aExp - bExp;
5896     if ( 0 < expDiff ) goto aExpBigger;
5897     if ( expDiff < 0 ) goto bExpBigger;
5898     if ( aExp == 0x7FFF ) {
5899         if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5900             return propagateFloatx80NaN(a, b, status);
5901         }
5902         float_raise(float_flag_invalid, status);
5903         return floatx80_default_nan(status);
5904     }
5905     if ( aExp == 0 ) {
5906         aExp = 1;
5907         bExp = 1;
5908     }
5909     zSig1 = 0;
5910     if ( bSig < aSig ) goto aBigger;
5911     if ( aSig < bSig ) goto bBigger;
5912     return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
5913  bExpBigger:
5914     if ( bExp == 0x7FFF ) {
5915         if ((uint64_t)(bSig << 1)) {
5916             return propagateFloatx80NaN(a, b, status);
5917         }
5918         return packFloatx80(zSign ^ 1, floatx80_infinity_high,
5919                             floatx80_infinity_low);
5920     }
5921     if ( aExp == 0 ) ++expDiff;
5922     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5923  bBigger:
5924     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
5925     zExp = bExp;
5926     zSign ^= 1;
5927     goto normalizeRoundAndPack;
5928  aExpBigger:
5929     if ( aExp == 0x7FFF ) {
5930         if ((uint64_t)(aSig << 1)) {
5931             return propagateFloatx80NaN(a, b, status);
5932         }
5933         return a;
5934     }
5935     if ( bExp == 0 ) --expDiff;
5936     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5937  aBigger:
5938     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
5939     zExp = aExp;
5940  normalizeRoundAndPack:
5941     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
5942                                          zSign, zExp, zSig0, zSig1, status);
5943 }
5944 
5945 /*----------------------------------------------------------------------------
5946 | Returns the result of adding the extended double-precision floating-point
5947 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
5948 | Standard for Binary Floating-Point Arithmetic.
5949 *----------------------------------------------------------------------------*/
5950 
5951 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
5952 {
5953     flag aSign, bSign;
5954 
5955     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5956         float_raise(float_flag_invalid, status);
5957         return floatx80_default_nan(status);
5958     }
5959     aSign = extractFloatx80Sign( a );
5960     bSign = extractFloatx80Sign( b );
5961     if ( aSign == bSign ) {
5962         return addFloatx80Sigs(a, b, aSign, status);
5963     }
5964     else {
5965         return subFloatx80Sigs(a, b, aSign, status);
5966     }
5967 
5968 }
5969 
5970 /*----------------------------------------------------------------------------
5971 | Returns the result of subtracting the extended double-precision floating-
5972 | point values `a' and `b'.  The operation is performed according to the
5973 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5974 *----------------------------------------------------------------------------*/
5975 
5976 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
5977 {
5978     flag aSign, bSign;
5979 
5980     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5981         float_raise(float_flag_invalid, status);
5982         return floatx80_default_nan(status);
5983     }
5984     aSign = extractFloatx80Sign( a );
5985     bSign = extractFloatx80Sign( b );
5986     if ( aSign == bSign ) {
5987         return subFloatx80Sigs(a, b, aSign, status);
5988     }
5989     else {
5990         return addFloatx80Sigs(a, b, aSign, status);
5991     }
5992 
5993 }
5994 
5995 /*----------------------------------------------------------------------------
5996 | Returns the result of multiplying the extended double-precision floating-
5997 | point values `a' and `b'.  The operation is performed according to the
5998 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5999 *----------------------------------------------------------------------------*/
6000 
6001 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
6002 {
6003     flag aSign, bSign, zSign;
6004     int32_t aExp, bExp, zExp;
6005     uint64_t aSig, bSig, zSig0, zSig1;
6006 
6007     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6008         float_raise(float_flag_invalid, status);
6009         return floatx80_default_nan(status);
6010     }
6011     aSig = extractFloatx80Frac( a );
6012     aExp = extractFloatx80Exp( a );
6013     aSign = extractFloatx80Sign( a );
6014     bSig = extractFloatx80Frac( b );
6015     bExp = extractFloatx80Exp( b );
6016     bSign = extractFloatx80Sign( b );
6017     zSign = aSign ^ bSign;
6018     if ( aExp == 0x7FFF ) {
6019         if (    (uint64_t) ( aSig<<1 )
6020              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
6021             return propagateFloatx80NaN(a, b, status);
6022         }
6023         if ( ( bExp | bSig ) == 0 ) goto invalid;
6024         return packFloatx80(zSign, floatx80_infinity_high,
6025                                    floatx80_infinity_low);
6026     }
6027     if ( bExp == 0x7FFF ) {
6028         if ((uint64_t)(bSig << 1)) {
6029             return propagateFloatx80NaN(a, b, status);
6030         }
6031         if ( ( aExp | aSig ) == 0 ) {
6032  invalid:
6033             float_raise(float_flag_invalid, status);
6034             return floatx80_default_nan(status);
6035         }
6036         return packFloatx80(zSign, floatx80_infinity_high,
6037                                    floatx80_infinity_low);
6038     }
6039     if ( aExp == 0 ) {
6040         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
6041         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
6042     }
6043     if ( bExp == 0 ) {
6044         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
6045         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
6046     }
6047     zExp = aExp + bExp - 0x3FFE;
6048     mul64To128( aSig, bSig, &zSig0, &zSig1 );
6049     if ( 0 < (int64_t) zSig0 ) {
6050         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
6051         --zExp;
6052     }
6053     return roundAndPackFloatx80(status->floatx80_rounding_precision,
6054                                 zSign, zExp, zSig0, zSig1, status);
6055 }
6056 
6057 /*----------------------------------------------------------------------------
6058 | Returns the result of dividing the extended double-precision floating-point
6059 | value `a' by the corresponding value `b'.  The operation is performed
6060 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6061 *----------------------------------------------------------------------------*/
6062 
6063 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
6064 {
6065     flag aSign, bSign, zSign;
6066     int32_t aExp, bExp, zExp;
6067     uint64_t aSig, bSig, zSig0, zSig1;
6068     uint64_t rem0, rem1, rem2, term0, term1, term2;
6069 
6070     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6071         float_raise(float_flag_invalid, status);
6072         return floatx80_default_nan(status);
6073     }
6074     aSig = extractFloatx80Frac( a );
6075     aExp = extractFloatx80Exp( a );
6076     aSign = extractFloatx80Sign( a );
6077     bSig = extractFloatx80Frac( b );
6078     bExp = extractFloatx80Exp( b );
6079     bSign = extractFloatx80Sign( b );
6080     zSign = aSign ^ bSign;
6081     if ( aExp == 0x7FFF ) {
6082         if ((uint64_t)(aSig << 1)) {
6083             return propagateFloatx80NaN(a, b, status);
6084         }
6085         if ( bExp == 0x7FFF ) {
6086             if ((uint64_t)(bSig << 1)) {
6087                 return propagateFloatx80NaN(a, b, status);
6088             }
6089             goto invalid;
6090         }
6091         return packFloatx80(zSign, floatx80_infinity_high,
6092                                    floatx80_infinity_low);
6093     }
6094     if ( bExp == 0x7FFF ) {
6095         if ((uint64_t)(bSig << 1)) {
6096             return propagateFloatx80NaN(a, b, status);
6097         }
6098         return packFloatx80( zSign, 0, 0 );
6099     }
6100     if ( bExp == 0 ) {
6101         if ( bSig == 0 ) {
6102             if ( ( aExp | aSig ) == 0 ) {
6103  invalid:
6104                 float_raise(float_flag_invalid, status);
6105                 return floatx80_default_nan(status);
6106             }
6107             float_raise(float_flag_divbyzero, status);
6108             return packFloatx80(zSign, floatx80_infinity_high,
6109                                        floatx80_infinity_low);
6110         }
6111         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
6112     }
6113     if ( aExp == 0 ) {
6114         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
6115         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
6116     }
6117     zExp = aExp - bExp + 0x3FFE;
6118     rem1 = 0;
6119     if ( bSig <= aSig ) {
6120         shift128Right( aSig, 0, 1, &aSig, &rem1 );
6121         ++zExp;
6122     }
6123     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
6124     mul64To128( bSig, zSig0, &term0, &term1 );
6125     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
6126     while ( (int64_t) rem0 < 0 ) {
6127         --zSig0;
6128         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
6129     }
6130     zSig1 = estimateDiv128To64( rem1, 0, bSig );
6131     if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
6132         mul64To128( bSig, zSig1, &term1, &term2 );
6133         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6134         while ( (int64_t) rem1 < 0 ) {
6135             --zSig1;
6136             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
6137         }
6138         zSig1 |= ( ( rem1 | rem2 ) != 0 );
6139     }
6140     return roundAndPackFloatx80(status->floatx80_rounding_precision,
6141                                 zSign, zExp, zSig0, zSig1, status);
6142 }
6143 
6144 /*----------------------------------------------------------------------------
6145 | Returns the remainder of the extended double-precision floating-point value
6146 | `a' with respect to the corresponding value `b'.  The operation is performed
6147 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6148 *----------------------------------------------------------------------------*/
6149 
6150 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
6151 {
6152     flag aSign, zSign;
6153     int32_t aExp, bExp, expDiff;
6154     uint64_t aSig0, aSig1, bSig;
6155     uint64_t q, term0, term1, alternateASig0, alternateASig1;
6156 
6157     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6158         float_raise(float_flag_invalid, status);
6159         return floatx80_default_nan(status);
6160     }
6161     aSig0 = extractFloatx80Frac( a );
6162     aExp = extractFloatx80Exp( a );
6163     aSign = extractFloatx80Sign( a );
6164     bSig = extractFloatx80Frac( b );
6165     bExp = extractFloatx80Exp( b );
6166     if ( aExp == 0x7FFF ) {
6167         if (    (uint64_t) ( aSig0<<1 )
6168              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
6169             return propagateFloatx80NaN(a, b, status);
6170         }
6171         goto invalid;
6172     }
6173     if ( bExp == 0x7FFF ) {
6174         if ((uint64_t)(bSig << 1)) {
6175             return propagateFloatx80NaN(a, b, status);
6176         }
6177         return a;
6178     }
6179     if ( bExp == 0 ) {
6180         if ( bSig == 0 ) {
6181  invalid:
6182             float_raise(float_flag_invalid, status);
6183             return floatx80_default_nan(status);
6184         }
6185         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
6186     }
6187     if ( aExp == 0 ) {
6188         if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
6189         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
6190     }
6191     bSig |= UINT64_C(0x8000000000000000);
6192     zSign = aSign;
6193     expDiff = aExp - bExp;
6194     aSig1 = 0;
6195     if ( expDiff < 0 ) {
6196         if ( expDiff < -1 ) return a;
6197         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
6198         expDiff = 0;
6199     }
6200     q = ( bSig <= aSig0 );
6201     if ( q ) aSig0 -= bSig;
6202     expDiff -= 64;
6203     while ( 0 < expDiff ) {
6204         q = estimateDiv128To64( aSig0, aSig1, bSig );
6205         q = ( 2 < q ) ? q - 2 : 0;
6206         mul64To128( bSig, q, &term0, &term1 );
6207         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6208         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
6209         expDiff -= 62;
6210     }
6211     expDiff += 64;
6212     if ( 0 < expDiff ) {
6213         q = estimateDiv128To64( aSig0, aSig1, bSig );
6214         q = ( 2 < q ) ? q - 2 : 0;
6215         q >>= 64 - expDiff;
6216         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
6217         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6218         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
6219         while ( le128( term0, term1, aSig0, aSig1 ) ) {
6220             ++q;
6221             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6222         }
6223     }
6224     else {
6225         term1 = 0;
6226         term0 = bSig;
6227     }
6228     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
6229     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
6230          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
6231               && ( q & 1 ) )
6232        ) {
6233         aSig0 = alternateASig0;
6234         aSig1 = alternateASig1;
6235         zSign = ! zSign;
6236     }
6237     return
6238         normalizeRoundAndPackFloatx80(
6239             80, zSign, bExp + expDiff, aSig0, aSig1, status);
6240 
6241 }
6242 
6243 /*----------------------------------------------------------------------------
6244 | Returns the square root of the extended double-precision floating-point
6245 | value `a'.  The operation is performed according to the IEC/IEEE Standard
6246 | for Binary Floating-Point Arithmetic.
6247 *----------------------------------------------------------------------------*/
6248 
6249 floatx80 floatx80_sqrt(floatx80 a, float_status *status)
6250 {
6251     flag aSign;
6252     int32_t aExp, zExp;
6253     uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
6254     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6255 
6256     if (floatx80_invalid_encoding(a)) {
6257         float_raise(float_flag_invalid, status);
6258         return floatx80_default_nan(status);
6259     }
6260     aSig0 = extractFloatx80Frac( a );
6261     aExp = extractFloatx80Exp( a );
6262     aSign = extractFloatx80Sign( a );
6263     if ( aExp == 0x7FFF ) {
6264         if ((uint64_t)(aSig0 << 1)) {
6265             return propagateFloatx80NaN(a, a, status);
6266         }
6267         if ( ! aSign ) return a;
6268         goto invalid;
6269     }
6270     if ( aSign ) {
6271         if ( ( aExp | aSig0 ) == 0 ) return a;
6272  invalid:
6273         float_raise(float_flag_invalid, status);
6274         return floatx80_default_nan(status);
6275     }
6276     if ( aExp == 0 ) {
6277         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
6278         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
6279     }
6280     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
6281     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
6282     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
6283     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6284     doubleZSig0 = zSig0<<1;
6285     mul64To128( zSig0, zSig0, &term0, &term1 );
6286     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6287     while ( (int64_t) rem0 < 0 ) {
6288         --zSig0;
6289         doubleZSig0 -= 2;
6290         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6291     }
6292     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6293     if ( ( zSig1 & UINT64_C(0x3FFFFFFFFFFFFFFF) ) <= 5 ) {
6294         if ( zSig1 == 0 ) zSig1 = 1;
6295         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6296         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6297         mul64To128( zSig1, zSig1, &term2, &term3 );
6298         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6299         while ( (int64_t) rem1 < 0 ) {
6300             --zSig1;
6301             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6302             term3 |= 1;
6303             term2 |= doubleZSig0;
6304             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6305         }
6306         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6307     }
6308     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
6309     zSig0 |= doubleZSig0;
6310     return roundAndPackFloatx80(status->floatx80_rounding_precision,
6311                                 0, zExp, zSig0, zSig1, status);
6312 }
6313 
6314 /*----------------------------------------------------------------------------
6315 | Returns 1 if the extended double-precision floating-point value `a' is equal
6316 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
6317 | raised if either operand is a NaN.  Otherwise, the comparison is performed
6318 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6319 *----------------------------------------------------------------------------*/
6320 
6321 int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
6322 {
6323 
6324     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
6325         || (extractFloatx80Exp(a) == 0x7FFF
6326             && (uint64_t) (extractFloatx80Frac(a) << 1))
6327         || (extractFloatx80Exp(b) == 0x7FFF
6328             && (uint64_t) (extractFloatx80Frac(b) << 1))
6329        ) {
6330         float_raise(float_flag_invalid, status);
6331         return 0;
6332     }
6333     return
6334            ( a.low == b.low )
6335         && (    ( a.high == b.high )
6336              || (    ( a.low == 0 )
6337                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6338            );
6339 
6340 }
6341 
6342 /*----------------------------------------------------------------------------
6343 | Returns 1 if the extended double-precision floating-point value `a' is
6344 | less than or equal to the corresponding value `b', and 0 otherwise.  The
6345 | invalid exception is raised if either operand is a NaN.  The comparison is
6346 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6347 | Arithmetic.
6348 *----------------------------------------------------------------------------*/
6349 
6350 int floatx80_le(floatx80 a, floatx80 b, float_status *status)
6351 {
6352     flag aSign, bSign;
6353 
6354     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
6355         || (extractFloatx80Exp(a) == 0x7FFF
6356             && (uint64_t) (extractFloatx80Frac(a) << 1))
6357         || (extractFloatx80Exp(b) == 0x7FFF
6358             && (uint64_t) (extractFloatx80Frac(b) << 1))
6359        ) {
6360         float_raise(float_flag_invalid, status);
6361         return 0;
6362     }
6363     aSign = extractFloatx80Sign( a );
6364     bSign = extractFloatx80Sign( b );
6365     if ( aSign != bSign ) {
6366         return
6367                aSign
6368             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6369                  == 0 );
6370     }
6371     return
6372           aSign ? le128( b.high, b.low, a.high, a.low )
6373         : le128( a.high, a.low, b.high, b.low );
6374 
6375 }
6376 
6377 /*----------------------------------------------------------------------------
6378 | Returns 1 if the extended double-precision floating-point value `a' is
6379 | less than the corresponding value `b', and 0 otherwise.  The invalid
6380 | exception is raised if either operand is a NaN.  The comparison is performed
6381 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6382 *----------------------------------------------------------------------------*/
6383 
6384 int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
6385 {
6386     flag aSign, bSign;
6387 
6388     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
6389         || (extractFloatx80Exp(a) == 0x7FFF
6390             && (uint64_t) (extractFloatx80Frac(a) << 1))
6391         || (extractFloatx80Exp(b) == 0x7FFF
6392             && (uint64_t) (extractFloatx80Frac(b) << 1))
6393        ) {
6394         float_raise(float_flag_invalid, status);
6395         return 0;
6396     }
6397     aSign = extractFloatx80Sign( a );
6398     bSign = extractFloatx80Sign( b );
6399     if ( aSign != bSign ) {
6400         return
6401                aSign
6402             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6403                  != 0 );
6404     }
6405     return
6406           aSign ? lt128( b.high, b.low, a.high, a.low )
6407         : lt128( a.high, a.low, b.high, b.low );
6408 
6409 }
6410 
6411 /*----------------------------------------------------------------------------
6412 | Returns 1 if the extended double-precision floating-point values `a' and `b'
6413 | cannot be compared, and 0 otherwise.  The invalid exception is raised if
6414 | either operand is a NaN.   The comparison is performed according to the
6415 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6416 *----------------------------------------------------------------------------*/
6417 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
6418 {
6419     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
6420         || (extractFloatx80Exp(a) == 0x7FFF
6421             && (uint64_t) (extractFloatx80Frac(a) << 1))
6422         || (extractFloatx80Exp(b) == 0x7FFF
6423             && (uint64_t) (extractFloatx80Frac(b) << 1))
6424        ) {
6425         float_raise(float_flag_invalid, status);
6426         return 1;
6427     }
6428     return 0;
6429 }
6430 
6431 /*----------------------------------------------------------------------------
6432 | Returns 1 if the extended double-precision floating-point value `a' is
6433 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
6434 | cause an exception.  The comparison is performed according to the IEC/IEEE
6435 | Standard for Binary Floating-Point Arithmetic.
6436 *----------------------------------------------------------------------------*/
6437 
6438 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
6439 {
6440 
6441     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6442         float_raise(float_flag_invalid, status);
6443         return 0;
6444     }
6445     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
6446               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
6447          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
6448               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
6449        ) {
6450         if (floatx80_is_signaling_nan(a, status)
6451          || floatx80_is_signaling_nan(b, status)) {
6452             float_raise(float_flag_invalid, status);
6453         }
6454         return 0;
6455     }
6456     return
6457            ( a.low == b.low )
6458         && (    ( a.high == b.high )
6459              || (    ( a.low == 0 )
6460                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6461            );
6462 
6463 }
6464 
6465 /*----------------------------------------------------------------------------
6466 | Returns 1 if the extended double-precision floating-point value `a' is less
6467 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
6468 | do not cause an exception.  Otherwise, the comparison is performed according
6469 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6470 *----------------------------------------------------------------------------*/
6471 
6472 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
6473 {
6474     flag aSign, bSign;
6475 
6476     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6477         float_raise(float_flag_invalid, status);
6478         return 0;
6479     }
6480     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
6481               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
6482          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
6483               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
6484        ) {
6485         if (floatx80_is_signaling_nan(a, status)
6486          || floatx80_is_signaling_nan(b, status)) {
6487             float_raise(float_flag_invalid, status);
6488         }
6489         return 0;
6490     }
6491     aSign = extractFloatx80Sign( a );
6492     bSign = extractFloatx80Sign( b );
6493     if ( aSign != bSign ) {
6494         return
6495                aSign
6496             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6497                  == 0 );
6498     }
6499     return
6500           aSign ? le128( b.high, b.low, a.high, a.low )
6501         : le128( a.high, a.low, b.high, b.low );
6502 
6503 }
6504 
6505 /*----------------------------------------------------------------------------
6506 | Returns 1 if the extended double-precision floating-point value `a' is less
6507 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
6508 | an exception.  Otherwise, the comparison is performed according to the
6509 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6510 *----------------------------------------------------------------------------*/
6511 
6512 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
6513 {
6514     flag aSign, bSign;
6515 
6516     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6517         float_raise(float_flag_invalid, status);
6518         return 0;
6519     }
6520     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
6521               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
6522          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
6523               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
6524        ) {
6525         if (floatx80_is_signaling_nan(a, status)
6526          || floatx80_is_signaling_nan(b, status)) {
6527             float_raise(float_flag_invalid, status);
6528         }
6529         return 0;
6530     }
6531     aSign = extractFloatx80Sign( a );
6532     bSign = extractFloatx80Sign( b );
6533     if ( aSign != bSign ) {
6534         return
6535                aSign
6536             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6537                  != 0 );
6538     }
6539     return
6540           aSign ? lt128( b.high, b.low, a.high, a.low )
6541         : lt128( a.high, a.low, b.high, b.low );
6542 
6543 }
6544 
6545 /*----------------------------------------------------------------------------
6546 | Returns 1 if the extended double-precision floating-point values `a' and `b'
6547 | cannot be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.
6548 | The comparison is performed according to the IEC/IEEE Standard for Binary
6549 | Floating-Point Arithmetic.
6550 *----------------------------------------------------------------------------*/
6551 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
6552 {
6553     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6554         float_raise(float_flag_invalid, status);
6555         return 1;
6556     }
6557     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
6558               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
6559          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
6560               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
6561        ) {
6562         if (floatx80_is_signaling_nan(a, status)
6563          || floatx80_is_signaling_nan(b, status)) {
6564             float_raise(float_flag_invalid, status);
6565         }
6566         return 1;
6567     }
6568     return 0;
6569 }
6570 
6571 /*----------------------------------------------------------------------------
6572 | Returns the result of converting the quadruple-precision floating-point
6573 | value `a' to the 32-bit two's complement integer format.  The conversion
6574 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6575 | Arithmetic---which means in particular that the conversion is rounded
6576 | according to the current rounding mode.  If `a' is a NaN, the largest
6577 | positive integer is returned.  Otherwise, if the conversion overflows, the
6578 | largest integer with the same sign as `a' is returned.
6579 *----------------------------------------------------------------------------*/
6580 
6581 int32_t float128_to_int32(float128 a, float_status *status)
6582 {
6583     flag aSign;
6584     int32_t aExp, shiftCount;
6585     uint64_t aSig0, aSig1;
6586 
6587     aSig1 = extractFloat128Frac1( a );
6588     aSig0 = extractFloat128Frac0( a );
6589     aExp = extractFloat128Exp( a );
6590     aSign = extractFloat128Sign( a );
6591     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
6592     if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
6593     aSig0 |= ( aSig1 != 0 );
6594     shiftCount = 0x4028 - aExp;
6595     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
6596     return roundAndPackInt32(aSign, aSig0, status);
6597 
6598 }
6599 
6600 /*----------------------------------------------------------------------------
6601 | Returns the result of converting the quadruple-precision floating-point
6602 | value `a' to the 32-bit two's complement integer format.  The conversion
6603 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6604 | Arithmetic, except that the conversion is always rounded toward zero.  If
6605 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
6606 | conversion overflows, the largest integer with the same sign as `a' is
6607 | returned.
6608 *----------------------------------------------------------------------------*/
6609 
6610 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
6611 {
6612     flag aSign;
6613     int32_t aExp, shiftCount;
6614     uint64_t aSig0, aSig1, savedASig;
6615     int32_t z;
6616 
6617     aSig1 = extractFloat128Frac1( a );
6618     aSig0 = extractFloat128Frac0( a );
6619     aExp = extractFloat128Exp( a );
6620     aSign = extractFloat128Sign( a );
6621     aSig0 |= ( aSig1 != 0 );
6622     if ( 0x401E < aExp ) {
6623         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
6624         goto invalid;
6625     }
6626     else if ( aExp < 0x3FFF ) {
6627         if (aExp || aSig0) {
6628             status->float_exception_flags |= float_flag_inexact;
6629         }
6630         return 0;
6631     }
6632     aSig0 |= UINT64_C(0x0001000000000000);
6633     shiftCount = 0x402F - aExp;
6634     savedASig = aSig0;
6635     aSig0 >>= shiftCount;
6636     z = aSig0;
6637     if ( aSign ) z = - z;
6638     if ( ( z < 0 ) ^ aSign ) {
6639  invalid:
6640         float_raise(float_flag_invalid, status);
6641         return aSign ? INT32_MIN : INT32_MAX;
6642     }
6643     if ( ( aSig0<<shiftCount ) != savedASig ) {
6644         status->float_exception_flags |= float_flag_inexact;
6645     }
6646     return z;
6647 
6648 }
6649 
6650 /*----------------------------------------------------------------------------
6651 | Returns the result of converting the quadruple-precision floating-point
6652 | value `a' to the 64-bit two's complement integer format.  The conversion
6653 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6654 | Arithmetic---which means in particular that the conversion is rounded
6655 | according to the current rounding mode.  If `a' is a NaN, the largest
6656 | positive integer is returned.  Otherwise, if the conversion overflows, the
6657 | largest integer with the same sign as `a' is returned.
6658 *----------------------------------------------------------------------------*/
6659 
6660 int64_t float128_to_int64(float128 a, float_status *status)
6661 {
6662     flag aSign;
6663     int32_t aExp, shiftCount;
6664     uint64_t aSig0, aSig1;
6665 
6666     aSig1 = extractFloat128Frac1( a );
6667     aSig0 = extractFloat128Frac0( a );
6668     aExp = extractFloat128Exp( a );
6669     aSign = extractFloat128Sign( a );
6670     if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
6671     shiftCount = 0x402F - aExp;
6672     if ( shiftCount <= 0 ) {
6673         if ( 0x403E < aExp ) {
6674             float_raise(float_flag_invalid, status);
6675             if (    ! aSign
6676                  || (    ( aExp == 0x7FFF )
6677                       && ( aSig1 || ( aSig0 != UINT64_C(0x0001000000000000) ) )
6678                     )
6679                ) {
6680                 return INT64_MAX;
6681             }
6682             return INT64_MIN;
6683         }
6684         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
6685     }
6686     else {
6687         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
6688     }
6689     return roundAndPackInt64(aSign, aSig0, aSig1, status);
6690 
6691 }
6692 
6693 /*----------------------------------------------------------------------------
6694 | Returns the result of converting the quadruple-precision floating-point
6695 | value `a' to the 64-bit two's complement integer format.  The conversion
6696 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6697 | Arithmetic, except that the conversion is always rounded toward zero.
6698 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
6699 | the conversion overflows, the largest integer with the same sign as `a' is
6700 | returned.
6701 *----------------------------------------------------------------------------*/
6702 
6703 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
6704 {
6705     flag aSign;
6706     int32_t aExp, shiftCount;
6707     uint64_t aSig0, aSig1;
6708     int64_t z;
6709 
6710     aSig1 = extractFloat128Frac1( a );
6711     aSig0 = extractFloat128Frac0( a );
6712     aExp = extractFloat128Exp( a );
6713     aSign = extractFloat128Sign( a );
6714     if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
6715     shiftCount = aExp - 0x402F;
6716     if ( 0 < shiftCount ) {
6717         if ( 0x403E <= aExp ) {
6718             aSig0 &= UINT64_C(0x0000FFFFFFFFFFFF);
6719             if (    ( a.high == UINT64_C(0xC03E000000000000) )
6720                  && ( aSig1 < UINT64_C(0x0002000000000000) ) ) {
6721                 if (aSig1) {
6722                     status->float_exception_flags |= float_flag_inexact;
6723                 }
6724             }
6725             else {
6726                 float_raise(float_flag_invalid, status);
6727                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
6728                     return INT64_MAX;
6729                 }
6730             }
6731             return INT64_MIN;
6732         }
6733         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
6734         if ( (uint64_t) ( aSig1<<shiftCount ) ) {
6735             status->float_exception_flags |= float_flag_inexact;
6736         }
6737     }
6738     else {
6739         if ( aExp < 0x3FFF ) {
6740             if ( aExp | aSig0 | aSig1 ) {
6741                 status->float_exception_flags |= float_flag_inexact;
6742             }
6743             return 0;
6744         }
6745         z = aSig0>>( - shiftCount );
6746         if (    aSig1
6747              || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
6748             status->float_exception_flags |= float_flag_inexact;
6749         }
6750     }
6751     if ( aSign ) z = - z;
6752     return z;
6753 
6754 }
6755 
6756 /*----------------------------------------------------------------------------
6757 | Returns the result of converting the quadruple-precision floating-point value
6758 | `a' to the 64-bit unsigned integer format.  The conversion is
6759 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6760 | Arithmetic---which means in particular that the conversion is rounded
6761 | according to the current rounding mode.  If `a' is a NaN, the largest
6762 | positive integer is returned.  If the conversion overflows, the
| largest unsigned integer is returned.  If `a' is negative, the value is
| rounded and zero is returned; negative values that do not round to zero
| raise the invalid exception.
6766 *----------------------------------------------------------------------------*/
6767 
6768 uint64_t float128_to_uint64(float128 a, float_status *status)
6769 {
6770     flag aSign;
6771     int aExp;
6772     int shiftCount;
6773     uint64_t aSig0, aSig1;
6774 
6775     aSig0 = extractFloat128Frac0(a);
6776     aSig1 = extractFloat128Frac1(a);
6777     aExp = extractFloat128Exp(a);
6778     aSign = extractFloat128Sign(a);
6779     if (aSign && (aExp > 0x3FFE)) {
6780         float_raise(float_flag_invalid, status);
6781         if (float128_is_any_nan(a)) {
6782             return UINT64_MAX;
6783         } else {
6784             return 0;
6785         }
6786     }
6787     if (aExp) {
6788         aSig0 |= UINT64_C(0x0001000000000000);
6789     }
6790     shiftCount = 0x402F - aExp;
6791     if (shiftCount <= 0) {
6792         if (0x403E < aExp) {
6793             float_raise(float_flag_invalid, status);
6794             return UINT64_MAX;
6795         }
6796         shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
6797     } else {
6798         shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
6799     }
6800     return roundAndPackUint64(aSign, aSig0, aSig1, status);
6801 }
6802 
6803 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
6804 {
6805     uint64_t v;
6806     signed char current_rounding_mode = status->float_rounding_mode;
6807 
6808     set_float_rounding_mode(float_round_to_zero, status);
6809     v = float128_to_uint64(a, status);
6810     set_float_rounding_mode(current_rounding_mode, status);
6811 
6812     return v;
6813 }
6814 
6815 /*----------------------------------------------------------------------------
6816 | Returns the result of converting the quadruple-precision floating-point
6817 | value `a' to the 32-bit unsigned integer format.  The conversion
6818 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6819 | Arithmetic except that the conversion is always rounded toward zero.
6820 | If `a' is a NaN, the largest positive integer is returned.  Otherwise,
6821 | if the conversion overflows, the largest unsigned integer is returned.
| If `a' is negative, the value is rounded and zero is returned; negative
| values that do not round to zero raise the invalid exception.
6824 *----------------------------------------------------------------------------*/
6825 
6826 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
6827 {
6828     uint64_t v;
6829     uint32_t res;
6830     int old_exc_flags = get_float_exception_flags(status);
6831 
6832     v = float128_to_uint64_round_to_zero(a, status);
6833     if (v > 0xffffffff) {
6834         res = 0xffffffff;
6835     } else {
6836         return v;
6837     }
6838     set_float_exception_flags(old_exc_flags, status);
6839     float_raise(float_flag_invalid, status);
6840     return res;
6841 }
6842 
6843 /*----------------------------------------------------------------------------
6844 | Returns the result of converting the quadruple-precision floating-point value
6845 | `a' to the 32-bit unsigned integer format.  The conversion is
6846 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6847 | Arithmetic---which means in particular that the conversion is rounded
6848 | according to the current rounding mode.  If `a' is a NaN, the largest
6849 | positive integer is returned.  If the conversion overflows, the
| largest unsigned integer is returned.  If `a' is negative, the value is
| rounded and zero is returned; negative values that do not round to zero
| raise the invalid exception.
6853 *----------------------------------------------------------------------------*/
6854 
6855 uint32_t float128_to_uint32(float128 a, float_status *status)
6856 {
6857     uint64_t v;
6858     uint32_t res;
6859     int old_exc_flags = get_float_exception_flags(status);
6860 
6861     v = float128_to_uint64(a, status);
6862     if (v > 0xffffffff) {
6863         res = 0xffffffff;
6864     } else {
6865         return v;
6866     }
6867     set_float_exception_flags(old_exc_flags, status);
6868     float_raise(float_flag_invalid, status);
6869     return res;
6870 }
6871 
6872 /*----------------------------------------------------------------------------
6873 | Returns the result of converting the quadruple-precision floating-point
6874 | value `a' to the single-precision floating-point format.  The conversion
6875 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6876 | Arithmetic.
6877 *----------------------------------------------------------------------------*/
6878 
6879 float32 float128_to_float32(float128 a, float_status *status)
6880 {
6881     flag aSign;
6882     int32_t aExp;
6883     uint64_t aSig0, aSig1;
6884     uint32_t zSig;
6885 
6886     aSig1 = extractFloat128Frac1( a );
6887     aSig0 = extractFloat128Frac0( a );
6888     aExp = extractFloat128Exp( a );
6889     aSign = extractFloat128Sign( a );
6890     if ( aExp == 0x7FFF ) {
6891         if ( aSig0 | aSig1 ) {
6892             return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
6893         }
6894         return packFloat32( aSign, 0xFF, 0 );
6895     }
6896     aSig0 |= ( aSig1 != 0 );
6897     shift64RightJamming( aSig0, 18, &aSig0 );
6898     zSig = aSig0;
6899     if ( aExp || zSig ) {
6900         zSig |= 0x40000000;
6901         aExp -= 0x3F81;
6902     }
6903     return roundAndPackFloat32(aSign, aExp, zSig, status);
6904 
6905 }
6906 
6907 /*----------------------------------------------------------------------------
6908 | Returns the result of converting the quadruple-precision floating-point
6909 | value `a' to the double-precision floating-point format.  The conversion
6910 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6911 | Arithmetic.
6912 *----------------------------------------------------------------------------*/
6913 
6914 float64 float128_to_float64(float128 a, float_status *status)
6915 {
6916     flag aSign;
6917     int32_t aExp;
6918     uint64_t aSig0, aSig1;
6919 
6920     aSig1 = extractFloat128Frac1( a );
6921     aSig0 = extractFloat128Frac0( a );
6922     aExp = extractFloat128Exp( a );
6923     aSign = extractFloat128Sign( a );
6924     if ( aExp == 0x7FFF ) {
6925         if ( aSig0 | aSig1 ) {
6926             return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
6927         }
6928         return packFloat64( aSign, 0x7FF, 0 );
6929     }
6930     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6931     aSig0 |= ( aSig1 != 0 );
6932     if ( aExp || aSig0 ) {
6933         aSig0 |= UINT64_C(0x4000000000000000);
6934         aExp -= 0x3C01;
6935     }
6936     return roundAndPackFloat64(aSign, aExp, aSig0, status);
6937 
6938 }
6939 
6940 /*----------------------------------------------------------------------------
6941 | Returns the result of converting the quadruple-precision floating-point
6942 | value `a' to the extended double-precision floating-point format.  The
6943 | conversion is performed according to the IEC/IEEE Standard for Binary
6944 | Floating-Point Arithmetic.
6945 *----------------------------------------------------------------------------*/
6946 
6947 floatx80 float128_to_floatx80(float128 a, float_status *status)
6948 {
6949     flag aSign;
6950     int32_t aExp;
6951     uint64_t aSig0, aSig1;
6952 
6953     aSig1 = extractFloat128Frac1( a );
6954     aSig0 = extractFloat128Frac0( a );
6955     aExp = extractFloat128Exp( a );
6956     aSign = extractFloat128Sign( a );
6957     if ( aExp == 0x7FFF ) {
6958         if ( aSig0 | aSig1 ) {
6959             return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
6960         }
6961         return packFloatx80(aSign, floatx80_infinity_high,
6962                                    floatx80_infinity_low);
6963     }
6964     if ( aExp == 0 ) {
6965         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
6966         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6967     }
6968     else {
6969         aSig0 |= UINT64_C(0x0001000000000000);
6970     }
6971     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
6972     return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
6973 
6974 }
6975 
6976 /*----------------------------------------------------------------------------
6977 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6978 | returns the result as a quadruple-precision floating-point value.  The
6979 | operation is performed according to the IEC/IEEE Standard for Binary
6980 | Floating-Point Arithmetic.
6981 *----------------------------------------------------------------------------*/
6982 
6983 float128 float128_round_to_int(float128 a, float_status *status)
6984 {
6985     flag aSign;
6986     int32_t aExp;
6987     uint64_t lastBitMask, roundBitsMask;
6988     float128 z;
6989 
6990     aExp = extractFloat128Exp( a );
6991     if ( 0x402F <= aExp ) {
6992         if ( 0x406F <= aExp ) {
6993             if (    ( aExp == 0x7FFF )
6994                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
6995                ) {
6996                 return propagateFloat128NaN(a, a, status);
6997             }
6998             return a;
6999         }
7000         lastBitMask = 1;
7001         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
7002         roundBitsMask = lastBitMask - 1;
7003         z = a;
7004         switch (status->float_rounding_mode) {
7005         case float_round_nearest_even:
7006             if ( lastBitMask ) {
7007                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
7008                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
7009             }
7010             else {
7011                 if ( (int64_t) z.low < 0 ) {
7012                     ++z.high;
7013                     if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
7014                 }
7015             }
7016             break;
7017         case float_round_ties_away:
7018             if (lastBitMask) {
7019                 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
7020             } else {
7021                 if ((int64_t) z.low < 0) {
7022                     ++z.high;
7023                 }
7024             }
7025             break;
7026         case float_round_to_zero:
7027             break;
7028         case float_round_up:
7029             if (!extractFloat128Sign(z)) {
7030                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
7031             }
7032             break;
7033         case float_round_down:
7034             if (extractFloat128Sign(z)) {
7035                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
7036             }
7037             break;
7038         case float_round_to_odd:
7039             /*
7040              * Note that if lastBitMask == 0, the last bit is the lsb
7041              * of high, and roundBitsMask == -1.
7042              */
7043             if ((lastBitMask ? z.low & lastBitMask : z.high & 1) == 0) {
7044                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
7045             }
7046             break;
7047         default:
7048             abort();
7049         }
7050         z.low &= ~ roundBitsMask;
7051     }
7052     else {
7053         if ( aExp < 0x3FFF ) {
7054             if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
7055             status->float_exception_flags |= float_flag_inexact;
7056             aSign = extractFloat128Sign( a );
7057             switch (status->float_rounding_mode) {
7058             case float_round_nearest_even:
7059                 if (    ( aExp == 0x3FFE )
7060                      && (   extractFloat128Frac0( a )
7061                           | extractFloat128Frac1( a ) )
7062                    ) {
7063                     return packFloat128( aSign, 0x3FFF, 0, 0 );
7064                 }
7065                 break;
7066             case float_round_ties_away:
7067                 if (aExp == 0x3FFE) {
7068                     return packFloat128(aSign, 0x3FFF, 0, 0);
7069                 }
7070                 break;
7071             case float_round_down:
7072                 return
7073                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
7074                     : packFloat128( 0, 0, 0, 0 );
7075             case float_round_up:
7076                 return
7077                       aSign ? packFloat128( 1, 0, 0, 0 )
7078                     : packFloat128( 0, 0x3FFF, 0, 0 );
7079 
7080             case float_round_to_odd:
7081                 return packFloat128(aSign, 0x3FFF, 0, 0);
7082             }
7083             return packFloat128( aSign, 0, 0, 0 );
7084         }
7085         lastBitMask = 1;
7086         lastBitMask <<= 0x402F - aExp;
7087         roundBitsMask = lastBitMask - 1;
7088         z.low = 0;
7089         z.high = a.high;
7090         switch (status->float_rounding_mode) {
7091         case float_round_nearest_even:
7092             z.high += lastBitMask>>1;
7093             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
7094                 z.high &= ~ lastBitMask;
7095             }
7096             break;
7097         case float_round_ties_away:
7098             z.high += lastBitMask>>1;
7099             break;
7100         case float_round_to_zero:
7101             break;
7102         case float_round_up:
7103             if (!extractFloat128Sign(z)) {
7104                 z.high |= ( a.low != 0 );
7105                 z.high += roundBitsMask;
7106             }
7107             break;
7108         case float_round_down:
7109             if (extractFloat128Sign(z)) {
7110                 z.high |= (a.low != 0);
7111                 z.high += roundBitsMask;
7112             }
7113             break;
7114         case float_round_to_odd:
7115             if ((z.high & lastBitMask) == 0) {
7116                 z.high |= (a.low != 0);
7117                 z.high += roundBitsMask;
7118             }
7119             break;
7120         default:
7121             abort();
7122         }
7123         z.high &= ~ roundBitsMask;
7124     }
7125     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
7126         status->float_exception_flags |= float_flag_inexact;
7127     }
7128     return z;
7129 
7130 }
7131 
7132 /*----------------------------------------------------------------------------
7133 | Returns the result of adding the absolute values of the quadruple-precision
7134 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
7135 | before being returned.  `zSign' is ignored if the result is a NaN.
7136 | The addition is performed according to the IEC/IEEE Standard for Binary
7137 | Floating-Point Arithmetic.
7138 *----------------------------------------------------------------------------*/
7139 
7140 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
7141                                 float_status *status)
7142 {
7143     int32_t aExp, bExp, zExp;
7144     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
7145     int32_t expDiff;
7146 
7147     aSig1 = extractFloat128Frac1( a );
7148     aSig0 = extractFloat128Frac0( a );
7149     aExp = extractFloat128Exp( a );
7150     bSig1 = extractFloat128Frac1( b );
7151     bSig0 = extractFloat128Frac0( b );
7152     bExp = extractFloat128Exp( b );
7153     expDiff = aExp - bExp;
7154     if ( 0 < expDiff ) {
7155         if ( aExp == 0x7FFF ) {
7156             if (aSig0 | aSig1) {
7157                 return propagateFloat128NaN(a, b, status);
7158             }
7159             return a;
7160         }
7161         if ( bExp == 0 ) {
7162             --expDiff;
7163         }
7164         else {
7165             bSig0 |= UINT64_C(0x0001000000000000);
7166         }
7167         shift128ExtraRightJamming(
7168             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
7169         zExp = aExp;
7170     }
7171     else if ( expDiff < 0 ) {
7172         if ( bExp == 0x7FFF ) {
7173             if (bSig0 | bSig1) {
7174                 return propagateFloat128NaN(a, b, status);
7175             }
7176             return packFloat128( zSign, 0x7FFF, 0, 0 );
7177         }
7178         if ( aExp == 0 ) {
7179             ++expDiff;
7180         }
7181         else {
7182             aSig0 |= UINT64_C(0x0001000000000000);
7183         }
7184         shift128ExtraRightJamming(
7185             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
7186         zExp = bExp;
7187     }
7188     else {
7189         if ( aExp == 0x7FFF ) {
7190             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
7191                 return propagateFloat128NaN(a, b, status);
7192             }
7193             return a;
7194         }
7195         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
7196         if ( aExp == 0 ) {
7197             if (status->flush_to_zero) {
7198                 if (zSig0 | zSig1) {
7199                     float_raise(float_flag_output_denormal, status);
7200                 }
7201                 return packFloat128(zSign, 0, 0, 0);
7202             }
7203             return packFloat128( zSign, 0, zSig0, zSig1 );
7204         }
7205         zSig2 = 0;
7206         zSig0 |= UINT64_C(0x0002000000000000);
7207         zExp = aExp;
7208         goto shiftRight1;
7209     }
7210     aSig0 |= UINT64_C(0x0001000000000000);
7211     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
7212     --zExp;
7213     if ( zSig0 < UINT64_C(0x0002000000000000) ) goto roundAndPack;
7214     ++zExp;
7215  shiftRight1:
7216     shift128ExtraRightJamming(
7217         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
7218  roundAndPack:
7219     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
7220 
7221 }
7222 
7223 /*----------------------------------------------------------------------------
7224 | Returns the result of subtracting the absolute values of the quadruple-
7225 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
7226 | difference is negated before being returned.  `zSign' is ignored if the
7227 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
7228 | Standard for Binary Floating-Point Arithmetic.
7229 *----------------------------------------------------------------------------*/
7230 
7231 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
7232                                 float_status *status)
7233 {
7234     int32_t aExp, bExp, zExp;
7235     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
7236     int32_t expDiff;
7237 
7238     aSig1 = extractFloat128Frac1( a );
7239     aSig0 = extractFloat128Frac0( a );
7240     aExp = extractFloat128Exp( a );
7241     bSig1 = extractFloat128Frac1( b );
7242     bSig0 = extractFloat128Frac0( b );
7243     bExp = extractFloat128Exp( b );
7244     expDiff = aExp - bExp;
7245     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
7246     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
7247     if ( 0 < expDiff ) goto aExpBigger;
7248     if ( expDiff < 0 ) goto bExpBigger;
7249     if ( aExp == 0x7FFF ) {
7250         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
7251             return propagateFloat128NaN(a, b, status);
7252         }
7253         float_raise(float_flag_invalid, status);
7254         return float128_default_nan(status);
7255     }
7256     if ( aExp == 0 ) {
7257         aExp = 1;
7258         bExp = 1;
7259     }
7260     if ( bSig0 < aSig0 ) goto aBigger;
7261     if ( aSig0 < bSig0 ) goto bBigger;
7262     if ( bSig1 < aSig1 ) goto aBigger;
7263     if ( aSig1 < bSig1 ) goto bBigger;
7264     return packFloat128(status->float_rounding_mode == float_round_down,
7265                         0, 0, 0);
7266  bExpBigger:
7267     if ( bExp == 0x7FFF ) {
7268         if (bSig0 | bSig1) {
7269             return propagateFloat128NaN(a, b, status);
7270         }
7271         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
7272     }
7273     if ( aExp == 0 ) {
7274         ++expDiff;
7275     }
7276     else {
7277         aSig0 |= UINT64_C(0x4000000000000000);
7278     }
7279     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
7280     bSig0 |= UINT64_C(0x4000000000000000);
7281  bBigger:
7282     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
7283     zExp = bExp;
7284     zSign ^= 1;
7285     goto normalizeRoundAndPack;
7286  aExpBigger:
7287     if ( aExp == 0x7FFF ) {
7288         if (aSig0 | aSig1) {
7289             return propagateFloat128NaN(a, b, status);
7290         }
7291         return a;
7292     }
7293     if ( bExp == 0 ) {
7294         --expDiff;
7295     }
7296     else {
7297         bSig0 |= UINT64_C(0x4000000000000000);
7298     }
7299     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
7300     aSig0 |= UINT64_C(0x4000000000000000);
7301  aBigger:
7302     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
7303     zExp = aExp;
7304  normalizeRoundAndPack:
7305     --zExp;
7306     return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
7307                                          status);
7308 
7309 }
7310 
7311 /*----------------------------------------------------------------------------
7312 | Returns the result of adding the quadruple-precision floating-point values
7313 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
7314 | for Binary Floating-Point Arithmetic.
7315 *----------------------------------------------------------------------------*/
7316 
7317 float128 float128_add(float128 a, float128 b, float_status *status)
7318 {
7319     flag aSign, bSign;
7320 
7321     aSign = extractFloat128Sign( a );
7322     bSign = extractFloat128Sign( b );
7323     if ( aSign == bSign ) {
7324         return addFloat128Sigs(a, b, aSign, status);
7325     }
7326     else {
7327         return subFloat128Sigs(a, b, aSign, status);
7328     }
7329 
7330 }
7331 
7332 /*----------------------------------------------------------------------------
7333 | Returns the result of subtracting the quadruple-precision floating-point
7334 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
7335 | Standard for Binary Floating-Point Arithmetic.
7336 *----------------------------------------------------------------------------*/
7337 
7338 float128 float128_sub(float128 a, float128 b, float_status *status)
7339 {
7340     flag aSign, bSign;
7341 
7342     aSign = extractFloat128Sign( a );
7343     bSign = extractFloat128Sign( b );
7344     if ( aSign == bSign ) {
7345         return subFloat128Sigs(a, b, aSign, status);
7346     }
7347     else {
7348         return addFloat128Sigs(a, b, aSign, status);
7349     }
7350 
7351 }
7352 
7353 /*----------------------------------------------------------------------------
7354 | Returns the result of multiplying the quadruple-precision floating-point
7355 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
7356 | Standard for Binary Floating-Point Arithmetic.
7357 *----------------------------------------------------------------------------*/
7358 
7359 float128 float128_mul(float128 a, float128 b, float_status *status)
7360 {
7361     flag aSign, bSign, zSign;
7362     int32_t aExp, bExp, zExp;
7363     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
7364 
7365     aSig1 = extractFloat128Frac1( a );
7366     aSig0 = extractFloat128Frac0( a );
7367     aExp = extractFloat128Exp( a );
7368     aSign = extractFloat128Sign( a );
7369     bSig1 = extractFloat128Frac1( b );
7370     bSig0 = extractFloat128Frac0( b );
7371     bExp = extractFloat128Exp( b );
7372     bSign = extractFloat128Sign( b );
7373     zSign = aSign ^ bSign;
7374     if ( aExp == 0x7FFF ) {
7375         if (    ( aSig0 | aSig1 )
7376              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
7377             return propagateFloat128NaN(a, b, status);
7378         }
7379         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
7380         return packFloat128( zSign, 0x7FFF, 0, 0 );
7381     }
7382     if ( bExp == 0x7FFF ) {
7383         if (bSig0 | bSig1) {
7384             return propagateFloat128NaN(a, b, status);
7385         }
7386         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
7387  invalid:
7388             float_raise(float_flag_invalid, status);
7389             return float128_default_nan(status);
7390         }
7391         return packFloat128( zSign, 0x7FFF, 0, 0 );
7392     }
7393     if ( aExp == 0 ) {
7394         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
7395         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7396     }
7397     if ( bExp == 0 ) {
7398         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
7399         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
7400     }
7401     zExp = aExp + bExp - 0x4000;
7402     aSig0 |= UINT64_C(0x0001000000000000);
7403     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
7404     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
7405     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
7406     zSig2 |= ( zSig3 != 0 );
7407     if (UINT64_C( 0x0002000000000000) <= zSig0 ) {
7408         shift128ExtraRightJamming(
7409             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
7410         ++zExp;
7411     }
7412     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
7413 
7414 }
7415 
7416 /*----------------------------------------------------------------------------
7417 | Returns the result of dividing the quadruple-precision floating-point value
7418 | `a' by the corresponding value `b'.  The operation is performed according to
7419 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7420 *----------------------------------------------------------------------------*/
7421 
7422 float128 float128_div(float128 a, float128 b, float_status *status)
7423 {
7424     flag aSign, bSign, zSign;
7425     int32_t aExp, bExp, zExp;
7426     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
7427     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
7428 
7429     aSig1 = extractFloat128Frac1( a );
7430     aSig0 = extractFloat128Frac0( a );
7431     aExp = extractFloat128Exp( a );
7432     aSign = extractFloat128Sign( a );
7433     bSig1 = extractFloat128Frac1( b );
7434     bSig0 = extractFloat128Frac0( b );
7435     bExp = extractFloat128Exp( b );
7436     bSign = extractFloat128Sign( b );
7437     zSign = aSign ^ bSign;
7438     if ( aExp == 0x7FFF ) {
7439         if (aSig0 | aSig1) {
7440             return propagateFloat128NaN(a, b, status);
7441         }
7442         if ( bExp == 0x7FFF ) {
7443             if (bSig0 | bSig1) {
7444                 return propagateFloat128NaN(a, b, status);
7445             }
7446             goto invalid;
7447         }
7448         return packFloat128( zSign, 0x7FFF, 0, 0 );
7449     }
7450     if ( bExp == 0x7FFF ) {
7451         if (bSig0 | bSig1) {
7452             return propagateFloat128NaN(a, b, status);
7453         }
7454         return packFloat128( zSign, 0, 0, 0 );
7455     }
7456     if ( bExp == 0 ) {
7457         if ( ( bSig0 | bSig1 ) == 0 ) {
7458             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
7459  invalid:
7460                 float_raise(float_flag_invalid, status);
7461                 return float128_default_nan(status);
7462             }
7463             float_raise(float_flag_divbyzero, status);
7464             return packFloat128( zSign, 0x7FFF, 0, 0 );
7465         }
7466         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
7467     }
7468     if ( aExp == 0 ) {
7469         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
7470         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7471     }
7472     zExp = aExp - bExp + 0x3FFD;
7473     shortShift128Left(
7474         aSig0 | UINT64_C(0x0001000000000000), aSig1, 15, &aSig0, &aSig1 );
7475     shortShift128Left(
7476         bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 );
7477     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
7478         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
7479         ++zExp;
7480     }
7481     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
7482     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
7483     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
7484     while ( (int64_t) rem0 < 0 ) {
7485         --zSig0;
7486         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
7487     }
7488     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
7489     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
7490         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
7491         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
7492         while ( (int64_t) rem1 < 0 ) {
7493             --zSig1;
7494             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
7495         }
7496         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
7497     }
7498     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
7499     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
7500 
7501 }
7502 
7503 /*----------------------------------------------------------------------------
7504 | Returns the remainder of the quadruple-precision floating-point value `a'
7505 | with respect to the corresponding value `b'.  The operation is performed
7506 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7507 *----------------------------------------------------------------------------*/
7508 
7509 float128 float128_rem(float128 a, float128 b, float_status *status)
7510 {
7511     flag aSign, zSign;
7512     int32_t aExp, bExp, expDiff;
7513     uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
7514     uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
7515     int64_t sigMean0;
7516 
7517     aSig1 = extractFloat128Frac1( a );
7518     aSig0 = extractFloat128Frac0( a );
7519     aExp = extractFloat128Exp( a );
7520     aSign = extractFloat128Sign( a );
7521     bSig1 = extractFloat128Frac1( b );
7522     bSig0 = extractFloat128Frac0( b );
7523     bExp = extractFloat128Exp( b );
7524     if ( aExp == 0x7FFF ) {
7525         if (    ( aSig0 | aSig1 )
7526              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
7527             return propagateFloat128NaN(a, b, status);
7528         }
7529         goto invalid;
7530     }
7531     if ( bExp == 0x7FFF ) {
7532         if (bSig0 | bSig1) {
7533             return propagateFloat128NaN(a, b, status);
7534         }
7535         return a;
7536     }
7537     if ( bExp == 0 ) {
7538         if ( ( bSig0 | bSig1 ) == 0 ) {
7539  invalid:
7540             float_raise(float_flag_invalid, status);
7541             return float128_default_nan(status);
7542         }
7543         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
7544     }
7545     if ( aExp == 0 ) {
7546         if ( ( aSig0 | aSig1 ) == 0 ) return a;
7547         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7548     }
7549     expDiff = aExp - bExp;
7550     if ( expDiff < -1 ) return a;
7551     shortShift128Left(
7552         aSig0 | UINT64_C(0x0001000000000000),
7553         aSig1,
7554         15 - ( expDiff < 0 ),
7555         &aSig0,
7556         &aSig1
7557     );
7558     shortShift128Left(
7559         bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 );
7560     q = le128( bSig0, bSig1, aSig0, aSig1 );
7561     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
7562     expDiff -= 64;
7563     while ( 0 < expDiff ) {
7564         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
7565         q = ( 4 < q ) ? q - 4 : 0;
7566         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
7567         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
7568         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
7569         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
7570         expDiff -= 61;
7571     }
7572     if ( -64 < expDiff ) {
7573         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
7574         q = ( 4 < q ) ? q - 4 : 0;
7575         q >>= - expDiff;
7576         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
7577         expDiff += 52;
7578         if ( expDiff < 0 ) {
7579             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
7580         }
7581         else {
7582             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
7583         }
7584         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
7585         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
7586     }
7587     else {
7588         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
7589         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
7590     }
7591     do {
7592         alternateASig0 = aSig0;
7593         alternateASig1 = aSig1;
7594         ++q;
7595         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
7596     } while ( 0 <= (int64_t) aSig0 );
7597     add128(
7598         aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
7599     if (    ( sigMean0 < 0 )
7600          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
7601         aSig0 = alternateASig0;
7602         aSig1 = alternateASig1;
7603     }
7604     zSign = ( (int64_t) aSig0 < 0 );
7605     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
7606     return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
7607                                          status);
7608 }
7609 
7610 /*----------------------------------------------------------------------------
7611 | Returns the square root of the quadruple-precision floating-point value `a'.
7612 | The operation is performed according to the IEC/IEEE Standard for Binary
7613 | Floating-Point Arithmetic.
7614 *----------------------------------------------------------------------------*/
7615 
7616 float128 float128_sqrt(float128 a, float_status *status)
7617 {
7618     flag aSign;
7619     int32_t aExp, zExp;
7620     uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
7621     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
7622 
7623     aSig1 = extractFloat128Frac1( a );
7624     aSig0 = extractFloat128Frac0( a );
7625     aExp = extractFloat128Exp( a );
7626     aSign = extractFloat128Sign( a );
7627     if ( aExp == 0x7FFF ) {
7628         if (aSig0 | aSig1) {
7629             return propagateFloat128NaN(a, a, status);
7630         }
7631         if ( ! aSign ) return a;
7632         goto invalid;
7633     }
7634     if ( aSign ) {
7635         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
7636  invalid:
7637         float_raise(float_flag_invalid, status);
7638         return float128_default_nan(status);
7639     }
7640     if ( aExp == 0 ) {
7641         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
7642         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7643     }
7644     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
7645     aSig0 |= UINT64_C(0x0001000000000000);
7646     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
7647     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
7648     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
7649     doubleZSig0 = zSig0<<1;
7650     mul64To128( zSig0, zSig0, &term0, &term1 );
7651     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
7652     while ( (int64_t) rem0 < 0 ) {
7653         --zSig0;
7654         doubleZSig0 -= 2;
7655         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
7656     }
7657     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
7658     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
7659         if ( zSig1 == 0 ) zSig1 = 1;
7660         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
7661         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
7662         mul64To128( zSig1, zSig1, &term2, &term3 );
7663         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
7664         while ( (int64_t) rem1 < 0 ) {
7665             --zSig1;
7666             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
7667             term3 |= 1;
7668             term2 |= doubleZSig0;
7669             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
7670         }
7671         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
7672     }
7673     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
7674     return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
7675 
7676 }
7677 
7678 /*----------------------------------------------------------------------------
7679 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
7680 | the corresponding value `b', and 0 otherwise.  The invalid exception is
7681 | raised if either operand is a NaN.  Otherwise, the comparison is performed
7682 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7683 *----------------------------------------------------------------------------*/
7684 
7685 int float128_eq(float128 a, float128 b, float_status *status)
7686 {
7687 
7688     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7689               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7690          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7691               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7692        ) {
7693         float_raise(float_flag_invalid, status);
7694         return 0;
7695     }
7696     return
7697            ( a.low == b.low )
7698         && (    ( a.high == b.high )
7699              || (    ( a.low == 0 )
7700                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
7701            );
7702 
7703 }
7704 
7705 /*----------------------------------------------------------------------------
7706 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7707 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
7708 | exception is raised if either operand is a NaN.  The comparison is performed
7709 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7710 *----------------------------------------------------------------------------*/
7711 
7712 int float128_le(float128 a, float128 b, float_status *status)
7713 {
7714     flag aSign, bSign;
7715 
7716     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7717               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7718          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7719               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7720        ) {
7721         float_raise(float_flag_invalid, status);
7722         return 0;
7723     }
7724     aSign = extractFloat128Sign( a );
7725     bSign = extractFloat128Sign( b );
7726     if ( aSign != bSign ) {
7727         return
7728                aSign
7729             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7730                  == 0 );
7731     }
7732     return
7733           aSign ? le128( b.high, b.low, a.high, a.low )
7734         : le128( a.high, a.low, b.high, b.low );
7735 
7736 }
7737 
7738 /*----------------------------------------------------------------------------
7739 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7740 | the corresponding value `b', and 0 otherwise.  The invalid exception is
7741 | raised if either operand is a NaN.  The comparison is performed according
7742 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7743 *----------------------------------------------------------------------------*/
7744 
7745 int float128_lt(float128 a, float128 b, float_status *status)
7746 {
7747     flag aSign, bSign;
7748 
7749     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7750               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7751          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7752               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7753        ) {
7754         float_raise(float_flag_invalid, status);
7755         return 0;
7756     }
7757     aSign = extractFloat128Sign( a );
7758     bSign = extractFloat128Sign( b );
7759     if ( aSign != bSign ) {
7760         return
7761                aSign
7762             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7763                  != 0 );
7764     }
7765     return
7766           aSign ? lt128( b.high, b.low, a.high, a.low )
7767         : lt128( a.high, a.low, b.high, b.low );
7768 
7769 }
7770 
7771 /*----------------------------------------------------------------------------
7772 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7773 | be compared, and 0 otherwise.  The invalid exception is raised if either
7774 | operand is a NaN. The comparison is performed according to the IEC/IEEE
7775 | Standard for Binary Floating-Point Arithmetic.
7776 *----------------------------------------------------------------------------*/
7777 
7778 int float128_unordered(float128 a, float128 b, float_status *status)
7779 {
7780     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7781               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7782          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7783               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7784        ) {
7785         float_raise(float_flag_invalid, status);
7786         return 1;
7787     }
7788     return 0;
7789 }
7790 
7791 /*----------------------------------------------------------------------------
7792 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
7793 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
7794 | exception.  The comparison is performed according to the IEC/IEEE Standard
7795 | for Binary Floating-Point Arithmetic.
7796 *----------------------------------------------------------------------------*/
7797 
7798 int float128_eq_quiet(float128 a, float128 b, float_status *status)
7799 {
7800 
7801     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7802               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7803          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7804               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7805        ) {
7806         if (float128_is_signaling_nan(a, status)
7807          || float128_is_signaling_nan(b, status)) {
7808             float_raise(float_flag_invalid, status);
7809         }
7810         return 0;
7811     }
7812     return
7813            ( a.low == b.low )
7814         && (    ( a.high == b.high )
7815              || (    ( a.low == 0 )
7816                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
7817            );
7818 
7819 }
7820 
7821 /*----------------------------------------------------------------------------
7822 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7823 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
7824 | cause an exception.  Otherwise, the comparison is performed according to the
7825 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7826 *----------------------------------------------------------------------------*/
7827 
7828 int float128_le_quiet(float128 a, float128 b, float_status *status)
7829 {
7830     flag aSign, bSign;
7831 
7832     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7833               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7834          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7835               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7836        ) {
7837         if (float128_is_signaling_nan(a, status)
7838          || float128_is_signaling_nan(b, status)) {
7839             float_raise(float_flag_invalid, status);
7840         }
7841         return 0;
7842     }
7843     aSign = extractFloat128Sign( a );
7844     bSign = extractFloat128Sign( b );
7845     if ( aSign != bSign ) {
7846         return
7847                aSign
7848             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7849                  == 0 );
7850     }
7851     return
7852           aSign ? le128( b.high, b.low, a.high, a.low )
7853         : le128( a.high, a.low, b.high, b.low );
7854 
7855 }
7856 
7857 /*----------------------------------------------------------------------------
7858 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7859 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
7860 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
7861 | Standard for Binary Floating-Point Arithmetic.
7862 *----------------------------------------------------------------------------*/
7863 
7864 int float128_lt_quiet(float128 a, float128 b, float_status *status)
7865 {
7866     flag aSign, bSign;
7867 
7868     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7869               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7870          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7871               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7872        ) {
7873         if (float128_is_signaling_nan(a, status)
7874          || float128_is_signaling_nan(b, status)) {
7875             float_raise(float_flag_invalid, status);
7876         }
7877         return 0;
7878     }
7879     aSign = extractFloat128Sign( a );
7880     bSign = extractFloat128Sign( b );
7881     if ( aSign != bSign ) {
7882         return
7883                aSign
7884             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7885                  != 0 );
7886     }
7887     return
7888           aSign ? lt128( b.high, b.low, a.high, a.low )
7889         : lt128( a.high, a.low, b.high, b.low );
7890 
7891 }
7892 
7893 /*----------------------------------------------------------------------------
7894 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7895 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
7896 | comparison is performed according to the IEC/IEEE Standard for Binary
7897 | Floating-Point Arithmetic.
7898 *----------------------------------------------------------------------------*/
7899 
7900 int float128_unordered_quiet(float128 a, float128 b, float_status *status)
7901 {
7902     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7903               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7904          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7905               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7906        ) {
7907         if (float128_is_signaling_nan(a, status)
7908          || float128_is_signaling_nan(b, status)) {
7909             float_raise(float_flag_invalid, status);
7910         }
7911         return 1;
7912     }
7913     return 0;
7914 }
7915 
7916 static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
7917                                             int is_quiet, float_status *status)
7918 {
7919     flag aSign, bSign;
7920 
7921     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
7922         float_raise(float_flag_invalid, status);
7923         return float_relation_unordered;
7924     }
7925     if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
7926           ( extractFloatx80Frac( a )<<1 ) ) ||
7927         ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
7928           ( extractFloatx80Frac( b )<<1 ) )) {
7929         if (!is_quiet ||
7930             floatx80_is_signaling_nan(a, status) ||
7931             floatx80_is_signaling_nan(b, status)) {
7932             float_raise(float_flag_invalid, status);
7933         }
7934         return float_relation_unordered;
7935     }
7936     aSign = extractFloatx80Sign( a );
7937     bSign = extractFloatx80Sign( b );
7938     if ( aSign != bSign ) {
7939 
7940         if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
7941              ( ( a.low | b.low ) == 0 ) ) {
7942             /* zero case */
7943             return float_relation_equal;
7944         } else {
7945             return 1 - (2 * aSign);
7946         }
7947     } else {
7948         if (a.low == b.low && a.high == b.high) {
7949             return float_relation_equal;
7950         } else {
7951             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7952         }
7953     }
7954 }
7955 
7956 int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
7957 {
7958     return floatx80_compare_internal(a, b, 0, status);
7959 }
7960 
7961 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
7962 {
7963     return floatx80_compare_internal(a, b, 1, status);
7964 }
7965 
7966 static inline int float128_compare_internal(float128 a, float128 b,
7967                                             int is_quiet, float_status *status)
7968 {
7969     flag aSign, bSign;
7970 
7971     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7972           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7973         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7974           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7975         if (!is_quiet ||
7976             float128_is_signaling_nan(a, status) ||
7977             float128_is_signaling_nan(b, status)) {
7978             float_raise(float_flag_invalid, status);
7979         }
7980         return float_relation_unordered;
7981     }
7982     aSign = extractFloat128Sign( a );
7983     bSign = extractFloat128Sign( b );
7984     if ( aSign != bSign ) {
7985         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7986             /* zero case */
7987             return float_relation_equal;
7988         } else {
7989             return 1 - (2 * aSign);
7990         }
7991     } else {
7992         if (a.low == b.low && a.high == b.high) {
7993             return float_relation_equal;
7994         } else {
7995             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7996         }
7997     }
7998 }
7999 
8000 int float128_compare(float128 a, float128 b, float_status *status)
8001 {
8002     return float128_compare_internal(a, b, 0, status);
8003 }
8004 
8005 int float128_compare_quiet(float128 a, float128 b, float_status *status)
8006 {
8007     return float128_compare_internal(a, b, 1, status);
8008 }
8009 
8010 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
8011 {
8012     flag aSign;
8013     int32_t aExp;
8014     uint64_t aSig;
8015 
8016     if (floatx80_invalid_encoding(a)) {
8017         float_raise(float_flag_invalid, status);
8018         return floatx80_default_nan(status);
8019     }
8020     aSig = extractFloatx80Frac( a );
8021     aExp = extractFloatx80Exp( a );
8022     aSign = extractFloatx80Sign( a );
8023 
8024     if ( aExp == 0x7FFF ) {
8025         if ( aSig<<1 ) {
8026             return propagateFloatx80NaN(a, a, status);
8027         }
8028         return a;
8029     }
8030 
8031     if (aExp == 0) {
8032         if (aSig == 0) {
8033             return a;
8034         }
8035         aExp++;
8036     }
8037 
8038     if (n > 0x10000) {
8039         n = 0x10000;
8040     } else if (n < -0x10000) {
8041         n = -0x10000;
8042     }
8043 
8044     aExp += n;
8045     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
8046                                          aSign, aExp, aSig, 0, status);
8047 }
8048 
8049 float128 float128_scalbn(float128 a, int n, float_status *status)
8050 {
8051     flag aSign;
8052     int32_t aExp;
8053     uint64_t aSig0, aSig1;
8054 
8055     aSig1 = extractFloat128Frac1( a );
8056     aSig0 = extractFloat128Frac0( a );
8057     aExp = extractFloat128Exp( a );
8058     aSign = extractFloat128Sign( a );
8059     if ( aExp == 0x7FFF ) {
8060         if ( aSig0 | aSig1 ) {
8061             return propagateFloat128NaN(a, a, status);
8062         }
8063         return a;
8064     }
8065     if (aExp != 0) {
8066         aSig0 |= UINT64_C(0x0001000000000000);
8067     } else if (aSig0 == 0 && aSig1 == 0) {
8068         return a;
8069     } else {
8070         aExp++;
8071     }
8072 
8073     if (n > 0x10000) {
8074         n = 0x10000;
8075     } else if (n < -0x10000) {
8076         n = -0x10000;
8077     }
8078 
8079     aExp += n - 1;
8080     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
8081                                          , status);
8082 
8083 }
8084 
8085 static void __attribute__((constructor)) softfloat_init(void)
8086 {
8087     union_float64 ua, ub, uc, ur;
8088 
8089     if (QEMU_NO_HARDFLOAT) {
8090         return;
8091     }
8092     /*
8093      * Test that the host's FMA is not obviously broken. For example,
8094      * glibc < 2.23 can perform an incorrect FMA on certain hosts; see
8095      *   https://sourceware.org/bugzilla/show_bug.cgi?id=13304
8096      */
8097     ua.s = 0x0020000000000001ULL;
8098     ub.s = 0x3ca0000000000000ULL;
8099     uc.s = 0x0020000000000000ULL;
8100     ur.h = fma(ua.h, ub.h, uc.h);
8101     if (ur.s != 0x0020000000000001ULL) {
8102         force_soft_fma = true;
8103     }
8104 }
8105