xref: /openbmc/qemu/fpu/softfloat.c (revision 2df9f571)
1 /*
2  * QEMU float support
3  *
4  * The code in this source file is derived from release 2a of the SoftFloat
5  * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6  * some later contributions) are provided under that license, as detailed below.
7  * It has subsequently been modified by contributors to the QEMU Project,
8  * so some portions are provided under:
9  *  the SoftFloat-2a license
10  *  the BSD license
11  *  GPL-v2-or-later
12  *
13  * Any future contributions to this file after December 1st 2014 will be
14  * taken to be licensed under the Softfloat-2a license unless specifically
15  * indicated otherwise.
16  */
17 
18 /*
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
22 
23 Written by John R. Hauser.  This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704.  Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980.  The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
32 
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38 
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
43 
44 ===============================================================================
45 */
46 
47 /* BSD licensing:
48  * Copyright (c) 2006, Fabrice Bellard
49  * All rights reserved.
50  *
51  * Redistribution and use in source and binary forms, with or without
52  * modification, are permitted provided that the following conditions are met:
53  *
54  * 1. Redistributions of source code must retain the above copyright notice,
55  * this list of conditions and the following disclaimer.
56  *
57  * 2. Redistributions in binary form must reproduce the above copyright notice,
58  * this list of conditions and the following disclaimer in the documentation
59  * and/or other materials provided with the distribution.
60  *
61  * 3. Neither the name of the copyright holder nor the names of its contributors
62  * may be used to endorse or promote products derived from this software without
63  * specific prior written permission.
64  *
65  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75  * THE POSSIBILITY OF SUCH DAMAGE.
76  */
77 
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79  * version 2 or later. See the COPYING file in the top-level directory.
80  */
81 
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83  * target-dependent and needs the TARGET_* macros.
84  */
85 #include "qemu/osdep.h"
86 #include <math.h>
87 #include "qemu/bitops.h"
88 #include "fpu/softfloat.h"
89 
90 /* We only need stdlib for abort() */
91 
92 /*----------------------------------------------------------------------------
93 | Primitive arithmetic functions, including multi-word arithmetic, and
94 | division and square root approximations.  (Can be specialized to target if
95 | desired.)
96 *----------------------------------------------------------------------------*/
97 #include "fpu/softfloat-macros.h"
98 
99 /*
100  * Hardfloat
101  *
102  * Fast emulation of guest FP instructions is challenging for two reasons.
103  * First, FP instruction semantics are similar but not identical, particularly
104  * when handling NaNs. Second, emulating at reasonable speed the guest FP
105  * exception flags is not trivial: reading the host's flags register with a
 * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp],
 * and trapping on every FP exception is neither fast nor pleasant to work
 * with.
108  *
109  * We address these challenges by leveraging the host FPU for a subset of the
110  * operations. To do this we expand on the idea presented in this paper:
111  *
112  * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a
113  * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615.
114  *
115  * The idea is thus to leverage the host FPU to (1) compute FP operations
116  * and (2) identify whether FP exceptions occurred while avoiding
117  * expensive exception flag register accesses.
118  *
119  * An important optimization shown in the paper is that given that exception
120  * flags are rarely cleared by the guest, we can avoid recomputing some flags.
121  * This is particularly useful for the inexact flag, which is very frequently
122  * raised in floating-point workloads.
123  *
124  * We optimize the code further by deferring to soft-fp whenever FP exception
125  * detection might get hairy. Two examples: (1) when at least one operand is
126  * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result
127  * and the result is < the minimum normal.
128  */
129 #define GEN_INPUT_FLUSH__NOCHECK(name, soft_t)                          \
130     static inline void name(soft_t *a, float_status *s)                 \
131     {                                                                   \
132         if (unlikely(soft_t ## _is_denormal(*a))) {                     \
133             *a = soft_t ## _set_sign(soft_t ## _zero,                   \
134                                      soft_t ## _is_neg(*a));            \
135             s->float_exception_flags |= float_flag_input_denormal;      \
136         }                                                               \
137     }
138 
139 GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck, float32)
140 GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck, float64)
141 #undef GEN_INPUT_FLUSH__NOCHECK
142 
143 #define GEN_INPUT_FLUSH1(name, soft_t)                  \
144     static inline void name(soft_t *a, float_status *s) \
145     {                                                   \
146         if (likely(!s->flush_inputs_to_zero)) {         \
147             return;                                     \
148         }                                               \
149         soft_t ## _input_flush__nocheck(a, s);          \
150     }
151 
152 GEN_INPUT_FLUSH1(float32_input_flush1, float32)
153 GEN_INPUT_FLUSH1(float64_input_flush1, float64)
154 #undef GEN_INPUT_FLUSH1
155 
156 #define GEN_INPUT_FLUSH2(name, soft_t)                                  \
157     static inline void name(soft_t *a, soft_t *b, float_status *s)      \
158     {                                                                   \
159         if (likely(!s->flush_inputs_to_zero)) {                         \
160             return;                                                     \
161         }                                                               \
162         soft_t ## _input_flush__nocheck(a, s);                          \
163         soft_t ## _input_flush__nocheck(b, s);                          \
164     }
165 
166 GEN_INPUT_FLUSH2(float32_input_flush2, float32)
167 GEN_INPUT_FLUSH2(float64_input_flush2, float64)
168 #undef GEN_INPUT_FLUSH2
169 
170 #define GEN_INPUT_FLUSH3(name, soft_t)                                  \
171     static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \
172     {                                                                   \
173         if (likely(!s->flush_inputs_to_zero)) {                         \
174             return;                                                     \
175         }                                                               \
176         soft_t ## _input_flush__nocheck(a, s);                          \
177         soft_t ## _input_flush__nocheck(b, s);                          \
178         soft_t ## _input_flush__nocheck(c, s);                          \
179     }
180 
181 GEN_INPUT_FLUSH3(float32_input_flush3, float32)
182 GEN_INPUT_FLUSH3(float64_input_flush3, float64)
183 #undef GEN_INPUT_FLUSH3
184 
/*
 * Choose whether to use fpclassify or float32/64_* primitives in the generated
 * hardfloat functions. Each combination of number of inputs (1, 2 or 3) and
 * float size (F32 or F64) gets its own value.
 *
 * The single-precision variants use the softfloat primitives on every host;
 * the double-precision variants use fpclassify on x86_64 hosts only.
 */
#define QEMU_HARDFLOAT_1F32_USE_FP 0
#define QEMU_HARDFLOAT_2F32_USE_FP 0
#define QEMU_HARDFLOAT_3F32_USE_FP 0

#if defined(__x86_64__)
# define QEMU_HARDFLOAT_1F64_USE_FP 1
# define QEMU_HARDFLOAT_2F64_USE_FP 1
# define QEMU_HARDFLOAT_3F64_USE_FP 1
#else
# define QEMU_HARDFLOAT_1F64_USE_FP 0
# define QEMU_HARDFLOAT_2F64_USE_FP 0
# define QEMU_HARDFLOAT_3F64_USE_FP 0
#endif
205 
/*
 * QEMU_HARDFLOAT_USE_ISINF selects isinf() instead of
 * float{32,64}_is_infinity when testing a hardfloat result for infinity.
 * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup.
 * On power64 however, using isinf() reduces fp-bench performance by up to 50%,
 * so every other host keeps the softfloat primitive.
 */
#if !defined(__x86_64__) && !defined(__aarch64__)
# define QEMU_HARDFLOAT_USE_ISINF   0
#else
# define QEMU_HARDFLOAT_USE_ISINF   1
#endif
217 
/*
 * Some targets clear the FP flags before most FP operations. This prevents
 * the use of hardfloat, since hardfloat relies on the inexact flag being
 * already set. Hardfloat is likewise disabled (with a build warning) when
 * compiling with -ffast-math.
 *
 * QEMU_SOFTFLOAT_ATTR additionally carries noinline when hardfloat is
 * enabled.
 */
#if defined(__FAST_MATH__)
# warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \
    IEEE implementation
#endif

#if defined(TARGET_PPC) || defined(__FAST_MATH__)
# define QEMU_NO_HARDFLOAT 1
# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN
#else
# define QEMU_NO_HARDFLOAT 0
# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline))
#endif
234 
235 static inline bool can_use_fpu(const float_status *s)
236 {
237     if (QEMU_NO_HARDFLOAT) {
238         return false;
239     }
240     return likely(s->float_exception_flags & float_flag_inexact &&
241                   s->float_rounding_mode == float_round_nearest_even);
242 }
243 
244 /*
245  * Hardfloat generation functions. Each operation can have two flavors:
246  * either using softfloat primitives (e.g. float32_is_zero_or_normal) for
247  * most condition checks, or native ones (e.g. fpclassify).
248  *
249  * The flavor is chosen by the callers. Instead of using macros, we rely on the
250  * compiler to propagate constants and inline everything into the callers.
251  *
252  * We only generate functions for operations with two inputs, since only
253  * these are common enough to justify consolidating them into common code.
254  */
255 
/*
 * View of one 32-bit value both as a softfloat float32 (s) and as a host
 * float (h), so the same bits can be handed to either implementation.
 */
typedef union {
    float32 s;  /* softfloat representation */
    float h;    /* host FPU representation */
} union_float32;
260 
/*
 * View of one 64-bit value both as a softfloat float64 (s) and as a host
 * double (h), so the same bits can be handed to either implementation.
 */
typedef union {
    float64 s;  /* softfloat representation */
    double h;   /* host FPU representation */
} union_float64;
265 
/* Predicate over the two operands of a 2-input operation. */
typedef bool (*f32_check_fn)(union_float32 a, union_float32 b);
typedef bool (*f64_check_fn)(union_float64 a, union_float64 b);

/* Softfloat implementation of a 2-input operation; receives the status. */
typedef float32 (*soft_f32_op2_fn)(float32 a, float32 b, float_status *s);
typedef float64 (*soft_f64_op2_fn)(float64 a, float64 b, float_status *s);
/* Host-FPU implementation of a 2-input operation. */
typedef float   (*hard_f32_op2_fn)(float a, float b);
typedef double  (*hard_f64_op2_fn)(double a, double b);
273 
274 /* 2-input is-zero-or-normal */
275 static inline bool f32_is_zon2(union_float32 a, union_float32 b)
276 {
277     if (QEMU_HARDFLOAT_2F32_USE_FP) {
278         /*
279          * Not using a temp variable for consecutive fpclassify calls ends up
280          * generating faster code.
281          */
282         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
283                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
284     }
285     return float32_is_zero_or_normal(a.s) &&
286            float32_is_zero_or_normal(b.s);
287 }
288 
289 static inline bool f64_is_zon2(union_float64 a, union_float64 b)
290 {
291     if (QEMU_HARDFLOAT_2F64_USE_FP) {
292         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
293                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
294     }
295     return float64_is_zero_or_normal(a.s) &&
296            float64_is_zero_or_normal(b.s);
297 }
298 
299 /* 3-input is-zero-or-normal */
300 static inline
301 bool f32_is_zon3(union_float32 a, union_float32 b, union_float32 c)
302 {
303     if (QEMU_HARDFLOAT_3F32_USE_FP) {
304         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
305                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
306                (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
307     }
308     return float32_is_zero_or_normal(a.s) &&
309            float32_is_zero_or_normal(b.s) &&
310            float32_is_zero_or_normal(c.s);
311 }
312 
313 static inline
314 bool f64_is_zon3(union_float64 a, union_float64 b, union_float64 c)
315 {
316     if (QEMU_HARDFLOAT_3F64_USE_FP) {
317         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
318                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
319                (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
320     }
321     return float64_is_zero_or_normal(a.s) &&
322            float64_is_zero_or_normal(b.s) &&
323            float64_is_zero_or_normal(c.s);
324 }
325 
326 static inline bool f32_is_inf(union_float32 a)
327 {
328     if (QEMU_HARDFLOAT_USE_ISINF) {
329         return isinf(a.h);
330     }
331     return float32_is_infinity(a.s);
332 }
333 
334 static inline bool f64_is_inf(union_float64 a)
335 {
336     if (QEMU_HARDFLOAT_USE_ISINF) {
337         return isinf(a.h);
338     }
339     return float64_is_infinity(a.s);
340 }
341 
342 /* Note: @fast_test and @post can be NULL */
343 static inline float32
344 float32_gen2(float32 xa, float32 xb, float_status *s,
345              hard_f32_op2_fn hard, soft_f32_op2_fn soft,
346              f32_check_fn pre, f32_check_fn post,
347              f32_check_fn fast_test, soft_f32_op2_fn fast_op)
348 {
349     union_float32 ua, ub, ur;
350 
351     ua.s = xa;
352     ub.s = xb;
353 
354     if (unlikely(!can_use_fpu(s))) {
355         goto soft;
356     }
357 
358     float32_input_flush2(&ua.s, &ub.s, s);
359     if (unlikely(!pre(ua, ub))) {
360         goto soft;
361     }
362     if (fast_test && fast_test(ua, ub)) {
363         return fast_op(ua.s, ub.s, s);
364     }
365 
366     ur.h = hard(ua.h, ub.h);
367     if (unlikely(f32_is_inf(ur))) {
368         s->float_exception_flags |= float_flag_overflow;
369     } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) {
370         if (post == NULL || post(ua, ub)) {
371             goto soft;
372         }
373     }
374     return ur.s;
375 
376  soft:
377     return soft(ua.s, ub.s, s);
378 }
379 
380 static inline float64
381 float64_gen2(float64 xa, float64 xb, float_status *s,
382              hard_f64_op2_fn hard, soft_f64_op2_fn soft,
383              f64_check_fn pre, f64_check_fn post,
384              f64_check_fn fast_test, soft_f64_op2_fn fast_op)
385 {
386     union_float64 ua, ub, ur;
387 
388     ua.s = xa;
389     ub.s = xb;
390 
391     if (unlikely(!can_use_fpu(s))) {
392         goto soft;
393     }
394 
395     float64_input_flush2(&ua.s, &ub.s, s);
396     if (unlikely(!pre(ua, ub))) {
397         goto soft;
398     }
399     if (fast_test && fast_test(ua, ub)) {
400         return fast_op(ua.s, ub.s, s);
401     }
402 
403     ur.h = hard(ua.h, ub.h);
404     if (unlikely(f64_is_inf(ur))) {
405         s->float_exception_flags |= float_flag_overflow;
406     } else if (unlikely(fabs(ur.h) <= DBL_MIN)) {
407         if (post == NULL || post(ua, ub)) {
408             goto soft;
409         }
410     }
411     return ur.s;
412 
413  soft:
414     return soft(ua.s, ub.s, s);
415 }
416 
417 /*----------------------------------------------------------------------------
418 | Returns the fraction bits of the single-precision floating-point value `a'.
419 *----------------------------------------------------------------------------*/
420 
421 static inline uint32_t extractFloat32Frac(float32 a)
422 {
423     return float32_val(a) & 0x007FFFFF;
424 }
425 
426 /*----------------------------------------------------------------------------
427 | Returns the exponent bits of the single-precision floating-point value `a'.
428 *----------------------------------------------------------------------------*/
429 
430 static inline int extractFloat32Exp(float32 a)
431 {
432     return (float32_val(a) >> 23) & 0xFF;
433 }
434 
435 /*----------------------------------------------------------------------------
436 | Returns the sign bit of the single-precision floating-point value `a'.
437 *----------------------------------------------------------------------------*/
438 
439 static inline flag extractFloat32Sign(float32 a)
440 {
441     return float32_val(a) >> 31;
442 }
443 
444 /*----------------------------------------------------------------------------
445 | Returns the fraction bits of the double-precision floating-point value `a'.
446 *----------------------------------------------------------------------------*/
447 
448 static inline uint64_t extractFloat64Frac(float64 a)
449 {
450     return float64_val(a) & UINT64_C(0x000FFFFFFFFFFFFF);
451 }
452 
453 /*----------------------------------------------------------------------------
454 | Returns the exponent bits of the double-precision floating-point value `a'.
455 *----------------------------------------------------------------------------*/
456 
457 static inline int extractFloat64Exp(float64 a)
458 {
459     return (float64_val(a) >> 52) & 0x7FF;
460 }
461 
462 /*----------------------------------------------------------------------------
463 | Returns the sign bit of the double-precision floating-point value `a'.
464 *----------------------------------------------------------------------------*/
465 
466 static inline flag extractFloat64Sign(float64 a)
467 {
468     return float64_val(a) >> 63;
469 }
470 
/*
 * Classify a floating point number. Everything from float_class_qnan
 * upwards is a NaN, so cls >= float_class_qnan matches any NaN.
 * The enum is packed so that it occupies as little space as possible
 * inside FloatParts.
 */

typedef enum __attribute__ ((__packed__)) {
    float_class_unclassified,  /* raw parts, not yet canonicalized */
    float_class_zero,
    float_class_normal,
    float_class_inf,
    float_class_qnan,  /* all NaNs from here */
    float_class_snan,
} FloatClass;
484 
485 /* Simple helpers for checking if, or what kind of, NaN we have */
486 static inline __attribute__((unused)) bool is_nan(FloatClass c)
487 {
488     return unlikely(c >= float_class_qnan);
489 }
490 
491 static inline __attribute__((unused)) bool is_snan(FloatClass c)
492 {
493     return c == float_class_snan;
494 }
495 
496 static inline __attribute__((unused)) bool is_qnan(FloatClass c)
497 {
498     return c == float_class_qnan;
499 }
500 
/*
 * Structure holding all of the decomposed parts of a float. The
 * exponent is unbiased and the fraction is normalized. All
 * calculations are done with a 64 bit fraction and then rounded as
 * appropriate for the final format.
 *
 * Thanks to the packed FloatClass a decent compiler should be able to
 * fit the whole structure into registers and avoid using the stack
 * for parameter passing.
 *
 * Note that the raw unpack/pack helpers also use this structure to carry
 * the un-canonicalized field values of a format.
 */

typedef struct {
    uint64_t frac;      /* fraction bits */
    int32_t  exp;       /* exponent */
    FloatClass cls;     /* classification of the value */
    bool sign;          /* sign bit */
} FloatParts;
518 
/* Bit position of the binary point within a canonical FloatParts.frac. */
#define DECOMPOSED_BINARY_POINT    (64 - 2)
/* The bit at the binary point: the leading one of a normalized fraction. */
#define DECOMPOSED_IMPLICIT_BIT    (1ull << DECOMPOSED_BINARY_POINT)
/* The bit above the implicit bit; set when rounding carries out. */
#define DECOMPOSED_OVERFLOW_BIT    (DECOMPOSED_IMPLICIT_BIT << 1)
522 
/* Structure holding all of the relevant parameters for a format.
 *   exp_size: the size of the exponent field
 *   exp_bias: the offset applied to the exponent field
 *   exp_max: the maximum normalised exponent
 *   frac_size: the size of the fraction field
 *   frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
 * The following are computed based on the size of the fraction:
 *   frac_lsb: least significant bit of fraction
 *   frac_lsbm1: the bit below the least significant bit (for rounding)
 *   round_mask/roundeven_mask: masks used for rounding
 * The following optional modifiers are available:
 *   arm_althp: handle ARM Alternative Half Precision
 */
typedef struct {
    int exp_size;
    int exp_bias;
    int exp_max;
    int frac_size;
    int frac_shift;
    uint64_t frac_lsb;
    uint64_t frac_lsbm1;
    uint64_t round_mask;
    uint64_t roundeven_mask;
    bool arm_althp;
} FloatFmt;
548 
/*
 * Expand the FloatFmt fields derived from the size of the exponent (E) and
 * fraction (F) fields, for use inside a FloatFmt initializer.
 *
 * Both arguments are parenthesized at every use so that an expression
 * argument (e.g. "20 + 3") expands with the intended precedence.
 */
#define FLOAT_PARAMS(E, F)                                           \
    .exp_size       = (E),                                           \
    .exp_bias       = ((1 << (E)) - 1) >> 1,                         \
    .exp_max        = (1 << (E)) - 1,                                \
    .frac_size      = (F),                                           \
    .frac_shift     = DECOMPOSED_BINARY_POINT - (F),                 \
    .frac_lsb       = 1ull << (DECOMPOSED_BINARY_POINT - (F)),       \
    .frac_lsbm1     = 1ull << ((DECOMPOSED_BINARY_POINT - (F)) - 1), \
    .round_mask     = (1ull << (DECOMPOSED_BINARY_POINT - (F))) - 1, \
    .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - (F))) - 1
560 
/* float16: 5-bit exponent field, 10-bit fraction field. */
static const FloatFmt float16_params = {
    FLOAT_PARAMS(5, 10)
};

/* Same field widths as float16_params, with the arm_althp modifier set. */
static const FloatFmt float16_params_ahp = {
    FLOAT_PARAMS(5, 10),
    .arm_althp = true
};

/* float32: 8-bit exponent field, 23-bit fraction field. */
static const FloatFmt float32_params = {
    FLOAT_PARAMS(8, 23)
};

/* float64: 11-bit exponent field, 52-bit fraction field. */
static const FloatFmt float64_params = {
    FLOAT_PARAMS(11, 52)
};
577 
578 /* Unpack a float to parts, but do not canonicalize.  */
579 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw)
580 {
581     const int sign_pos = fmt.frac_size + fmt.exp_size;
582 
583     return (FloatParts) {
584         .cls = float_class_unclassified,
585         .sign = extract64(raw, sign_pos, 1),
586         .exp = extract64(raw, fmt.frac_size, fmt.exp_size),
587         .frac = extract64(raw, 0, fmt.frac_size),
588     };
589 }
590 
591 static inline FloatParts float16_unpack_raw(float16 f)
592 {
593     return unpack_raw(float16_params, f);
594 }
595 
596 static inline FloatParts float32_unpack_raw(float32 f)
597 {
598     return unpack_raw(float32_params, f);
599 }
600 
601 static inline FloatParts float64_unpack_raw(float64 f)
602 {
603     return unpack_raw(float64_params, f);
604 }
605 
606 /* Pack a float from parts, but do not canonicalize.  */
607 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p)
608 {
609     const int sign_pos = fmt.frac_size + fmt.exp_size;
610     uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp);
611     return deposit64(ret, sign_pos, 1, p.sign);
612 }
613 
614 static inline float16 float16_pack_raw(FloatParts p)
615 {
616     return make_float16(pack_raw(float16_params, p));
617 }
618 
619 static inline float32 float32_pack_raw(FloatParts p)
620 {
621     return make_float32(pack_raw(float32_params, p));
622 }
623 
624 static inline float64 float64_pack_raw(FloatParts p)
625 {
626     return make_float64(pack_raw(float64_params, p));
627 }
628 
629 /*----------------------------------------------------------------------------
630 | Functions and definitions to determine:  (1) whether tininess for underflow
631 | is detected before or after rounding by default, (2) what (if anything)
632 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
633 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
634 | are propagated from function inputs to output.  These details are target-
635 | specific.
636 *----------------------------------------------------------------------------*/
637 #include "softfloat-specialize.inc.c"
638 
639 /* Canonicalize EXP and FRAC, setting CLS.  */
640 static FloatParts sf_canonicalize(FloatParts part, const FloatFmt *parm,
641                                   float_status *status)
642 {
643     if (part.exp == parm->exp_max && !parm->arm_althp) {
644         if (part.frac == 0) {
645             part.cls = float_class_inf;
646         } else {
647             part.frac <<= parm->frac_shift;
648             part.cls = (parts_is_snan_frac(part.frac, status)
649                         ? float_class_snan : float_class_qnan);
650         }
651     } else if (part.exp == 0) {
652         if (likely(part.frac == 0)) {
653             part.cls = float_class_zero;
654         } else if (status->flush_inputs_to_zero) {
655             float_raise(float_flag_input_denormal, status);
656             part.cls = float_class_zero;
657             part.frac = 0;
658         } else {
659             int shift = clz64(part.frac) - 1;
660             part.cls = float_class_normal;
661             part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
662             part.frac <<= shift;
663         }
664     } else {
665         part.cls = float_class_normal;
666         part.exp -= parm->exp_bias;
667         part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
668     }
669     return part;
670 }
671 
672 /* Round and uncanonicalize a floating-point number by parts. There
673  * are FRAC_SHIFT bits that may require rounding at the bottom of the
674  * fraction; these bits will be removed. The exponent will be biased
675  * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
676  */
677 
678 static FloatParts round_canonical(FloatParts p, float_status *s,
679                                   const FloatFmt *parm)
680 {
681     const uint64_t frac_lsb = parm->frac_lsb;
682     const uint64_t frac_lsbm1 = parm->frac_lsbm1;
683     const uint64_t round_mask = parm->round_mask;
684     const uint64_t roundeven_mask = parm->roundeven_mask;
685     const int exp_max = parm->exp_max;
686     const int frac_shift = parm->frac_shift;
687     uint64_t frac, inc;
688     int exp, flags = 0;
689     bool overflow_norm;
690 
691     frac = p.frac;
692     exp = p.exp;
693 
694     switch (p.cls) {
695     case float_class_normal:
696         switch (s->float_rounding_mode) {
697         case float_round_nearest_even:
698             overflow_norm = false;
699             inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
700             break;
701         case float_round_ties_away:
702             overflow_norm = false;
703             inc = frac_lsbm1;
704             break;
705         case float_round_to_zero:
706             overflow_norm = true;
707             inc = 0;
708             break;
709         case float_round_up:
710             inc = p.sign ? 0 : round_mask;
711             overflow_norm = p.sign;
712             break;
713         case float_round_down:
714             inc = p.sign ? round_mask : 0;
715             overflow_norm = !p.sign;
716             break;
717         case float_round_to_odd:
718             overflow_norm = true;
719             inc = frac & frac_lsb ? 0 : round_mask;
720             break;
721         default:
722             g_assert_not_reached();
723         }
724 
725         exp += parm->exp_bias;
726         if (likely(exp > 0)) {
727             if (frac & round_mask) {
728                 flags |= float_flag_inexact;
729                 frac += inc;
730                 if (frac & DECOMPOSED_OVERFLOW_BIT) {
731                     frac >>= 1;
732                     exp++;
733                 }
734             }
735             frac >>= frac_shift;
736 
737             if (parm->arm_althp) {
738                 /* ARM Alt HP eschews Inf and NaN for a wider exponent.  */
739                 if (unlikely(exp > exp_max)) {
740                     /* Overflow.  Return the maximum normal.  */
741                     flags = float_flag_invalid;
742                     exp = exp_max;
743                     frac = -1;
744                 }
745             } else if (unlikely(exp >= exp_max)) {
746                 flags |= float_flag_overflow | float_flag_inexact;
747                 if (overflow_norm) {
748                     exp = exp_max - 1;
749                     frac = -1;
750                 } else {
751                     p.cls = float_class_inf;
752                     goto do_inf;
753                 }
754             }
755         } else if (s->flush_to_zero) {
756             flags |= float_flag_output_denormal;
757             p.cls = float_class_zero;
758             goto do_zero;
759         } else {
760             bool is_tiny = (s->float_detect_tininess
761                             == float_tininess_before_rounding)
762                         || (exp < 0)
763                         || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT);
764 
765             shift64RightJamming(frac, 1 - exp, &frac);
766             if (frac & round_mask) {
767                 /* Need to recompute round-to-even.  */
768                 switch (s->float_rounding_mode) {
769                 case float_round_nearest_even:
770                     inc = ((frac & roundeven_mask) != frac_lsbm1
771                            ? frac_lsbm1 : 0);
772                     break;
773                 case float_round_to_odd:
774                     inc = frac & frac_lsb ? 0 : round_mask;
775                     break;
776                 }
777                 flags |= float_flag_inexact;
778                 frac += inc;
779             }
780 
781             exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
782             frac >>= frac_shift;
783 
784             if (is_tiny && (flags & float_flag_inexact)) {
785                 flags |= float_flag_underflow;
786             }
787             if (exp == 0 && frac == 0) {
788                 p.cls = float_class_zero;
789             }
790         }
791         break;
792 
793     case float_class_zero:
794     do_zero:
795         exp = 0;
796         frac = 0;
797         break;
798 
799     case float_class_inf:
800     do_inf:
801         assert(!parm->arm_althp);
802         exp = exp_max;
803         frac = 0;
804         break;
805 
806     case float_class_qnan:
807     case float_class_snan:
808         assert(!parm->arm_althp);
809         exp = exp_max;
810         frac >>= parm->frac_shift;
811         break;
812 
813     default:
814         g_assert_not_reached();
815     }
816 
817     float_raise(flags, s);
818     p.exp = exp;
819     p.frac = frac;
820     return p;
821 }
822 
823 /* Explicit FloatFmt version */
824 static FloatParts float16a_unpack_canonical(float16 f, float_status *s,
825                                             const FloatFmt *params)
826 {
827     return sf_canonicalize(float16_unpack_raw(f), params, s);
828 }
829 
830 static FloatParts float16_unpack_canonical(float16 f, float_status *s)
831 {
832     return float16a_unpack_canonical(f, s, &float16_params);
833 }
834 
835 static float16 float16a_round_pack_canonical(FloatParts p, float_status *s,
836                                              const FloatFmt *params)
837 {
838     return float16_pack_raw(round_canonical(p, s, params));
839 }
840 
841 static float16 float16_round_pack_canonical(FloatParts p, float_status *s)
842 {
843     return float16a_round_pack_canonical(p, s, &float16_params);
844 }
845 
846 static FloatParts float32_unpack_canonical(float32 f, float_status *s)
847 {
848     return sf_canonicalize(float32_unpack_raw(f), &float32_params, s);
849 }
850 
851 static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
852 {
853     return float32_pack_raw(round_canonical(p, s, &float32_params));
854 }
855 
856 static FloatParts float64_unpack_canonical(float64 f, float_status *s)
857 {
858     return sf_canonicalize(float64_unpack_raw(f), &float64_params, s);
859 }
860 
861 static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
862 {
863     return float64_pack_raw(round_canonical(p, s, &float64_params));
864 }
865 
866 static FloatParts return_nan(FloatParts a, float_status *s)
867 {
868     switch (a.cls) {
869     case float_class_snan:
870         s->float_exception_flags |= float_flag_invalid;
871         a = parts_silence_nan(a, s);
872         /* fall through */
873     case float_class_qnan:
874         if (s->default_nan_mode) {
875             return parts_default_nan(s);
876         }
877         break;
878 
879     default:
880         g_assert_not_reached();
881     }
882     return a;
883 }
884 
885 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s)
886 {
887     if (is_snan(a.cls) || is_snan(b.cls)) {
888         s->float_exception_flags |= float_flag_invalid;
889     }
890 
891     if (s->default_nan_mode) {
892         return parts_default_nan(s);
893     } else {
894         if (pickNaN(a.cls, b.cls,
895                     a.frac > b.frac ||
896                     (a.frac == b.frac && a.sign < b.sign))) {
897             a = b;
898         }
899         if (is_snan(a.cls)) {
900             return parts_silence_nan(a, s);
901         }
902     }
903     return a;
904 }
905 
906 static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c,
907                                   bool inf_zero, float_status *s)
908 {
909     int which;
910 
911     if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
912         s->float_exception_flags |= float_flag_invalid;
913     }
914 
915     which = pickNaNMulAdd(a.cls, b.cls, c.cls, inf_zero, s);
916 
917     if (s->default_nan_mode) {
918         /* Note that this check is after pickNaNMulAdd so that function
919          * has an opportunity to set the Invalid flag.
920          */
921         which = 3;
922     }
923 
924     switch (which) {
925     case 0:
926         break;
927     case 1:
928         a = b;
929         break;
930     case 2:
931         a = c;
932         break;
933     case 3:
934         return parts_default_nan(s);
935     default:
936         g_assert_not_reached();
937     }
938 
939     if (is_snan(a.cls)) {
940         return parts_silence_nan(a, s);
941     }
942     return a;
943 }
944 
945 /*
946  * Returns the result of adding or subtracting the values of the
947  * floating-point values `a' and `b'. The operation is performed
948  * according to the IEC/IEEE Standard for Binary Floating-Point
949  * Arithmetic.
950  */
951 
952 static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
953                                 float_status *s)
954 {
955     bool a_sign = a.sign;
956     bool b_sign = b.sign ^ subtract;
957 
958     if (a_sign != b_sign) {
959         /* Subtraction */
960 
961         if (a.cls == float_class_normal && b.cls == float_class_normal) {
962             if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
963                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
964                 a.frac = a.frac - b.frac;
965             } else {
966                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
967                 a.frac = b.frac - a.frac;
968                 a.exp = b.exp;
969                 a_sign ^= 1;
970             }
971 
972             if (a.frac == 0) {
973                 a.cls = float_class_zero;
974                 a.sign = s->float_rounding_mode == float_round_down;
975             } else {
976                 int shift = clz64(a.frac) - 1;
977                 a.frac = a.frac << shift;
978                 a.exp = a.exp - shift;
979                 a.sign = a_sign;
980             }
981             return a;
982         }
983         if (is_nan(a.cls) || is_nan(b.cls)) {
984             return pick_nan(a, b, s);
985         }
986         if (a.cls == float_class_inf) {
987             if (b.cls == float_class_inf) {
988                 float_raise(float_flag_invalid, s);
989                 return parts_default_nan(s);
990             }
991             return a;
992         }
993         if (a.cls == float_class_zero && b.cls == float_class_zero) {
994             a.sign = s->float_rounding_mode == float_round_down;
995             return a;
996         }
997         if (a.cls == float_class_zero || b.cls == float_class_inf) {
998             b.sign = a_sign ^ 1;
999             return b;
1000         }
1001         if (b.cls == float_class_zero) {
1002             return a;
1003         }
1004     } else {
1005         /* Addition */
1006         if (a.cls == float_class_normal && b.cls == float_class_normal) {
1007             if (a.exp > b.exp) {
1008                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
1009             } else if (a.exp < b.exp) {
1010                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
1011                 a.exp = b.exp;
1012             }
1013             a.frac += b.frac;
1014             if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
1015                 shift64RightJamming(a.frac, 1, &a.frac);
1016                 a.exp += 1;
1017             }
1018             return a;
1019         }
1020         if (is_nan(a.cls) || is_nan(b.cls)) {
1021             return pick_nan(a, b, s);
1022         }
1023         if (a.cls == float_class_inf || b.cls == float_class_zero) {
1024             return a;
1025         }
1026         if (b.cls == float_class_inf || a.cls == float_class_zero) {
1027             b.sign = b_sign;
1028             return b;
1029         }
1030     }
1031     g_assert_not_reached();
1032 }
1033 
1034 /*
1035  * Returns the result of adding or subtracting the floating-point
1036  * values `a' and `b'. The operation is performed according to the
1037  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1038  */
1039 
1040 float16 QEMU_FLATTEN float16_add(float16 a, float16 b, float_status *status)
1041 {
1042     FloatParts pa = float16_unpack_canonical(a, status);
1043     FloatParts pb = float16_unpack_canonical(b, status);
1044     FloatParts pr = addsub_floats(pa, pb, false, status);
1045 
1046     return float16_round_pack_canonical(pr, status);
1047 }
1048 
1049 float16 QEMU_FLATTEN float16_sub(float16 a, float16 b, float_status *status)
1050 {
1051     FloatParts pa = float16_unpack_canonical(a, status);
1052     FloatParts pb = float16_unpack_canonical(b, status);
1053     FloatParts pr = addsub_floats(pa, pb, true, status);
1054 
1055     return float16_round_pack_canonical(pr, status);
1056 }
1057 
1058 static float32 QEMU_SOFTFLOAT_ATTR
1059 soft_f32_addsub(float32 a, float32 b, bool subtract, float_status *status)
1060 {
1061     FloatParts pa = float32_unpack_canonical(a, status);
1062     FloatParts pb = float32_unpack_canonical(b, status);
1063     FloatParts pr = addsub_floats(pa, pb, subtract, status);
1064 
1065     return float32_round_pack_canonical(pr, status);
1066 }
1067 
1068 static inline float32 soft_f32_add(float32 a, float32 b, float_status *status)
1069 {
1070     return soft_f32_addsub(a, b, false, status);
1071 }
1072 
1073 static inline float32 soft_f32_sub(float32 a, float32 b, float_status *status)
1074 {
1075     return soft_f32_addsub(a, b, true, status);
1076 }
1077 
1078 static float64 QEMU_SOFTFLOAT_ATTR
1079 soft_f64_addsub(float64 a, float64 b, bool subtract, float_status *status)
1080 {
1081     FloatParts pa = float64_unpack_canonical(a, status);
1082     FloatParts pb = float64_unpack_canonical(b, status);
1083     FloatParts pr = addsub_floats(pa, pb, subtract, status);
1084 
1085     return float64_round_pack_canonical(pr, status);
1086 }
1087 
1088 static inline float64 soft_f64_add(float64 a, float64 b, float_status *status)
1089 {
1090     return soft_f64_addsub(a, b, false, status);
1091 }
1092 
1093 static inline float64 soft_f64_sub(float64 a, float64 b, float_status *status)
1094 {
1095     return soft_f64_addsub(a, b, true, status);
1096 }
1097 
/* Host-FPU single-precision addition. */
static float hard_f32_add(float a, float b)
{
    float sum = a + b;

    return sum;
}
1102 
/* Host-FPU single-precision subtraction. */
static float hard_f32_sub(float a, float b)
{
    float diff = a - b;

    return diff;
}
1107 
/* Host-FPU double-precision addition. */
static double hard_f64_add(double a, double b)
{
    double sum = a + b;

    return sum;
}
1112 
/* Host-FPU double-precision subtraction. */
static double hard_f64_sub(double a, double b)
{
    double diff = a - b;

    return diff;
}
1117 
1118 static bool f32_addsub_post(union_float32 a, union_float32 b)
1119 {
1120     if (QEMU_HARDFLOAT_2F32_USE_FP) {
1121         return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
1122     }
1123     return !(float32_is_zero(a.s) && float32_is_zero(b.s));
1124 }
1125 
1126 static bool f64_addsub_post(union_float64 a, union_float64 b)
1127 {
1128     if (QEMU_HARDFLOAT_2F64_USE_FP) {
1129         return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
1130     } else {
1131         return !(float64_is_zero(a.s) && float64_is_zero(b.s));
1132     }
1133 }
1134 
1135 static float32 float32_addsub(float32 a, float32 b, float_status *s,
1136                               hard_f32_op2_fn hard, soft_f32_op2_fn soft)
1137 {
1138     return float32_gen2(a, b, s, hard, soft,
1139                         f32_is_zon2, f32_addsub_post, NULL, NULL);
1140 }
1141 
1142 static float64 float64_addsub(float64 a, float64 b, float_status *s,
1143                               hard_f64_op2_fn hard, soft_f64_op2_fn soft)
1144 {
1145     return float64_gen2(a, b, s, hard, soft,
1146                         f64_is_zon2, f64_addsub_post, NULL, NULL);
1147 }
1148 
1149 float32 QEMU_FLATTEN
1150 float32_add(float32 a, float32 b, float_status *s)
1151 {
1152     return float32_addsub(a, b, s, hard_f32_add, soft_f32_add);
1153 }
1154 
1155 float32 QEMU_FLATTEN
1156 float32_sub(float32 a, float32 b, float_status *s)
1157 {
1158     return float32_addsub(a, b, s, hard_f32_sub, soft_f32_sub);
1159 }
1160 
1161 float64 QEMU_FLATTEN
1162 float64_add(float64 a, float64 b, float_status *s)
1163 {
1164     return float64_addsub(a, b, s, hard_f64_add, soft_f64_add);
1165 }
1166 
1167 float64 QEMU_FLATTEN
1168 float64_sub(float64 a, float64 b, float_status *s)
1169 {
1170     return float64_addsub(a, b, s, hard_f64_sub, soft_f64_sub);
1171 }
1172 
1173 /*
1174  * Returns the result of multiplying the floating-point values `a' and
1175  * `b'. The operation is performed according to the IEC/IEEE Standard
1176  * for Binary Floating-Point Arithmetic.
1177  */
1178 
1179 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s)
1180 {
1181     bool sign = a.sign ^ b.sign;
1182 
1183     if (a.cls == float_class_normal && b.cls == float_class_normal) {
1184         uint64_t hi, lo;
1185         int exp = a.exp + b.exp;
1186 
1187         mul64To128(a.frac, b.frac, &hi, &lo);
1188         shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1189         if (lo & DECOMPOSED_OVERFLOW_BIT) {
1190             shift64RightJamming(lo, 1, &lo);
1191             exp += 1;
1192         }
1193 
1194         /* Re-use a */
1195         a.exp = exp;
1196         a.sign = sign;
1197         a.frac = lo;
1198         return a;
1199     }
1200     /* handle all the NaN cases */
1201     if (is_nan(a.cls) || is_nan(b.cls)) {
1202         return pick_nan(a, b, s);
1203     }
1204     /* Inf * Zero == NaN */
1205     if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
1206         (a.cls == float_class_zero && b.cls == float_class_inf)) {
1207         s->float_exception_flags |= float_flag_invalid;
1208         return parts_default_nan(s);
1209     }
1210     /* Multiply by 0 or Inf */
1211     if (a.cls == float_class_inf || a.cls == float_class_zero) {
1212         a.sign = sign;
1213         return a;
1214     }
1215     if (b.cls == float_class_inf || b.cls == float_class_zero) {
1216         b.sign = sign;
1217         return b;
1218     }
1219     g_assert_not_reached();
1220 }
1221 
1222 float16 QEMU_FLATTEN float16_mul(float16 a, float16 b, float_status *status)
1223 {
1224     FloatParts pa = float16_unpack_canonical(a, status);
1225     FloatParts pb = float16_unpack_canonical(b, status);
1226     FloatParts pr = mul_floats(pa, pb, status);
1227 
1228     return float16_round_pack_canonical(pr, status);
1229 }
1230 
1231 static float32 QEMU_SOFTFLOAT_ATTR
1232 soft_f32_mul(float32 a, float32 b, float_status *status)
1233 {
1234     FloatParts pa = float32_unpack_canonical(a, status);
1235     FloatParts pb = float32_unpack_canonical(b, status);
1236     FloatParts pr = mul_floats(pa, pb, status);
1237 
1238     return float32_round_pack_canonical(pr, status);
1239 }
1240 
1241 static float64 QEMU_SOFTFLOAT_ATTR
1242 soft_f64_mul(float64 a, float64 b, float_status *status)
1243 {
1244     FloatParts pa = float64_unpack_canonical(a, status);
1245     FloatParts pb = float64_unpack_canonical(b, status);
1246     FloatParts pr = mul_floats(pa, pb, status);
1247 
1248     return float64_round_pack_canonical(pr, status);
1249 }
1250 
/* Host-FPU single-precision multiplication. */
static float hard_f32_mul(float a, float b)
{
    float product = a * b;

    return product;
}
1255 
/* Host-FPU double-precision multiplication. */
static double hard_f64_mul(double a, double b)
{
    double product = a * b;

    return product;
}
1260 
1261 static bool f32_mul_fast_test(union_float32 a, union_float32 b)
1262 {
1263     return float32_is_zero(a.s) || float32_is_zero(b.s);
1264 }
1265 
1266 static bool f64_mul_fast_test(union_float64 a, union_float64 b)
1267 {
1268     return float64_is_zero(a.s) || float64_is_zero(b.s);
1269 }
1270 
1271 static float32 f32_mul_fast_op(float32 a, float32 b, float_status *s)
1272 {
1273     bool signbit = float32_is_neg(a) ^ float32_is_neg(b);
1274 
1275     return float32_set_sign(float32_zero, signbit);
1276 }
1277 
1278 static float64 f64_mul_fast_op(float64 a, float64 b, float_status *s)
1279 {
1280     bool signbit = float64_is_neg(a) ^ float64_is_neg(b);
1281 
1282     return float64_set_sign(float64_zero, signbit);
1283 }
1284 
1285 float32 QEMU_FLATTEN
1286 float32_mul(float32 a, float32 b, float_status *s)
1287 {
1288     return float32_gen2(a, b, s, hard_f32_mul, soft_f32_mul,
1289                         f32_is_zon2, NULL, f32_mul_fast_test, f32_mul_fast_op);
1290 }
1291 
1292 float64 QEMU_FLATTEN
1293 float64_mul(float64 a, float64 b, float_status *s)
1294 {
1295     return float64_gen2(a, b, s, hard_f64_mul, soft_f64_mul,
1296                         f64_is_zon2, NULL, f64_mul_fast_test, f64_mul_fast_op);
1297 }
1298 
1299 /*
1300  * Returns the result of multiplying the floating-point values `a' and
1301  * `b' then adding 'c', with no intermediate rounding step after the
1302  * multiplication. The operation is performed according to the
1303  * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
1304  * The flags argument allows the caller to select negation of the
1305  * addend, the intermediate product, or the final result. (The
1306  * difference between this and having the caller do a separate
1307  * negation is that negating externally will flip the sign bit on
1308  * NaNs.)
1309  */
1310 
1311 static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c,
1312                                 int flags, float_status *s)
1313 {
1314     bool inf_zero = ((1 << a.cls) | (1 << b.cls)) ==
1315                     ((1 << float_class_inf) | (1 << float_class_zero));
1316     bool p_sign;
1317     bool sign_flip = flags & float_muladd_negate_result;
1318     FloatClass p_class;
1319     uint64_t hi, lo;
1320     int p_exp;
1321 
1322     /* It is implementation-defined whether the cases of (0,inf,qnan)
1323      * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
1324      * they return if they do), so we have to hand this information
1325      * off to the target-specific pick-a-NaN routine.
1326      */
1327     if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) {
1328         return pick_nan_muladd(a, b, c, inf_zero, s);
1329     }
1330 
1331     if (inf_zero) {
1332         s->float_exception_flags |= float_flag_invalid;
1333         return parts_default_nan(s);
1334     }
1335 
1336     if (flags & float_muladd_negate_c) {
1337         c.sign ^= 1;
1338     }
1339 
1340     p_sign = a.sign ^ b.sign;
1341 
1342     if (flags & float_muladd_negate_product) {
1343         p_sign ^= 1;
1344     }
1345 
1346     if (a.cls == float_class_inf || b.cls == float_class_inf) {
1347         p_class = float_class_inf;
1348     } else if (a.cls == float_class_zero || b.cls == float_class_zero) {
1349         p_class = float_class_zero;
1350     } else {
1351         p_class = float_class_normal;
1352     }
1353 
1354     if (c.cls == float_class_inf) {
1355         if (p_class == float_class_inf && p_sign != c.sign) {
1356             s->float_exception_flags |= float_flag_invalid;
1357             return parts_default_nan(s);
1358         } else {
1359             a.cls = float_class_inf;
1360             a.sign = c.sign ^ sign_flip;
1361             return a;
1362         }
1363     }
1364 
1365     if (p_class == float_class_inf) {
1366         a.cls = float_class_inf;
1367         a.sign = p_sign ^ sign_flip;
1368         return a;
1369     }
1370 
1371     if (p_class == float_class_zero) {
1372         if (c.cls == float_class_zero) {
1373             if (p_sign != c.sign) {
1374                 p_sign = s->float_rounding_mode == float_round_down;
1375             }
1376             c.sign = p_sign;
1377         } else if (flags & float_muladd_halve_result) {
1378             c.exp -= 1;
1379         }
1380         c.sign ^= sign_flip;
1381         return c;
1382     }
1383 
1384     /* a & b should be normals now... */
1385     assert(a.cls == float_class_normal &&
1386            b.cls == float_class_normal);
1387 
1388     p_exp = a.exp + b.exp;
1389 
1390     /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
1391      * result.
1392      */
1393     mul64To128(a.frac, b.frac, &hi, &lo);
1394     /* binary point now at bit 124 */
1395 
1396     /* check for overflow */
1397     if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) {
1398         shift128RightJamming(hi, lo, 1, &hi, &lo);
1399         p_exp += 1;
1400     }
1401 
1402     /* + add/sub */
1403     if (c.cls == float_class_zero) {
1404         /* move binary point back to 62 */
1405         shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1406     } else {
1407         int exp_diff = p_exp - c.exp;
1408         if (p_sign == c.sign) {
1409             /* Addition */
1410             if (exp_diff <= 0) {
1411                 shift128RightJamming(hi, lo,
1412                                      DECOMPOSED_BINARY_POINT - exp_diff,
1413                                      &hi, &lo);
1414                 lo += c.frac;
1415                 p_exp = c.exp;
1416             } else {
1417                 uint64_t c_hi, c_lo;
1418                 /* shift c to the same binary point as the product (124) */
1419                 c_hi = c.frac >> 2;
1420                 c_lo = 0;
1421                 shift128RightJamming(c_hi, c_lo,
1422                                      exp_diff,
1423                                      &c_hi, &c_lo);
1424                 add128(hi, lo, c_hi, c_lo, &hi, &lo);
1425                 /* move binary point back to 62 */
1426                 shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1427             }
1428 
1429             if (lo & DECOMPOSED_OVERFLOW_BIT) {
1430                 shift64RightJamming(lo, 1, &lo);
1431                 p_exp += 1;
1432             }
1433 
1434         } else {
1435             /* Subtraction */
1436             uint64_t c_hi, c_lo;
1437             /* make C binary point match product at bit 124 */
1438             c_hi = c.frac >> 2;
1439             c_lo = 0;
1440 
1441             if (exp_diff <= 0) {
1442                 shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
1443                 if (exp_diff == 0
1444                     &&
1445                     (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
1446                     sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1447                 } else {
1448                     sub128(c_hi, c_lo, hi, lo, &hi, &lo);
1449                     p_sign ^= 1;
1450                     p_exp = c.exp;
1451                 }
1452             } else {
1453                 shift128RightJamming(c_hi, c_lo,
1454                                      exp_diff,
1455                                      &c_hi, &c_lo);
1456                 sub128(hi, lo, c_hi, c_lo, &hi, &lo);
1457             }
1458 
1459             if (hi == 0 && lo == 0) {
1460                 a.cls = float_class_zero;
1461                 a.sign = s->float_rounding_mode == float_round_down;
1462                 a.sign ^= sign_flip;
1463                 return a;
1464             } else {
1465                 int shift;
1466                 if (hi != 0) {
1467                     shift = clz64(hi);
1468                 } else {
1469                     shift = clz64(lo) + 64;
1470                 }
1471                 /* Normalizing to a binary point of 124 is the
1472                    correct adjust for the exponent.  However since we're
1473                    shifting, we might as well put the binary point back
1474                    at 62 where we really want it.  Therefore shift as
1475                    if we're leaving 1 bit at the top of the word, but
1476                    adjust the exponent as if we're leaving 3 bits.  */
1477                 shift -= 1;
1478                 if (shift >= 64) {
1479                     lo = lo << (shift - 64);
1480                 } else {
1481                     hi = (hi << shift) | (lo >> (64 - shift));
1482                     lo = hi | ((lo << shift) != 0);
1483                 }
1484                 p_exp -= shift - 2;
1485             }
1486         }
1487     }
1488 
1489     if (flags & float_muladd_halve_result) {
1490         p_exp -= 1;
1491     }
1492 
1493     /* finally prepare our result */
1494     a.cls = float_class_normal;
1495     a.sign = p_sign ^ sign_flip;
1496     a.exp = p_exp;
1497     a.frac = lo;
1498 
1499     return a;
1500 }
1501 
1502 float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c,
1503                                                 int flags, float_status *status)
1504 {
1505     FloatParts pa = float16_unpack_canonical(a, status);
1506     FloatParts pb = float16_unpack_canonical(b, status);
1507     FloatParts pc = float16_unpack_canonical(c, status);
1508     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1509 
1510     return float16_round_pack_canonical(pr, status);
1511 }
1512 
1513 static float32 QEMU_SOFTFLOAT_ATTR
1514 soft_f32_muladd(float32 a, float32 b, float32 c, int flags,
1515                 float_status *status)
1516 {
1517     FloatParts pa = float32_unpack_canonical(a, status);
1518     FloatParts pb = float32_unpack_canonical(b, status);
1519     FloatParts pc = float32_unpack_canonical(c, status);
1520     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1521 
1522     return float32_round_pack_canonical(pr, status);
1523 }
1524 
1525 static float64 QEMU_SOFTFLOAT_ATTR
1526 soft_f64_muladd(float64 a, float64 b, float64 c, int flags,
1527                 float_status *status)
1528 {
1529     FloatParts pa = float64_unpack_canonical(a, status);
1530     FloatParts pb = float64_unpack_canonical(b, status);
1531     FloatParts pc = float64_unpack_canonical(c, status);
1532     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1533 
1534     return float64_round_pack_canonical(pr, status);
1535 }
1536 
/*
 * When true, float32_muladd() and float64_muladd() bypass the host
 * fmaf()/fma() fast path and always use the software implementation.
 * NOTE(review): nothing in this part of the file assigns it; presumably
 * it is set once during initialisation elsewhere - confirm.
 */
static bool force_soft_fma;
1538 
1539 float32 QEMU_FLATTEN
1540 float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s)
1541 {
1542     union_float32 ua, ub, uc, ur;
1543 
1544     ua.s = xa;
1545     ub.s = xb;
1546     uc.s = xc;
1547 
1548     if (unlikely(!can_use_fpu(s))) {
1549         goto soft;
1550     }
1551     if (unlikely(flags & float_muladd_halve_result)) {
1552         goto soft;
1553     }
1554 
1555     float32_input_flush3(&ua.s, &ub.s, &uc.s, s);
1556     if (unlikely(!f32_is_zon3(ua, ub, uc))) {
1557         goto soft;
1558     }
1559 
1560     if (unlikely(force_soft_fma)) {
1561         goto soft;
1562     }
1563 
1564     /*
1565      * When (a || b) == 0, there's no need to check for under/over flow,
1566      * since we know the addend is (normal || 0) and the product is 0.
1567      */
1568     if (float32_is_zero(ua.s) || float32_is_zero(ub.s)) {
1569         union_float32 up;
1570         bool prod_sign;
1571 
1572         prod_sign = float32_is_neg(ua.s) ^ float32_is_neg(ub.s);
1573         prod_sign ^= !!(flags & float_muladd_negate_product);
1574         up.s = float32_set_sign(float32_zero, prod_sign);
1575 
1576         if (flags & float_muladd_negate_c) {
1577             uc.h = -uc.h;
1578         }
1579         ur.h = up.h + uc.h;
1580     } else {
1581         union_float32 ua_orig = ua;
1582         union_float32 uc_orig = uc;
1583 
1584         if (flags & float_muladd_negate_product) {
1585             ua.h = -ua.h;
1586         }
1587         if (flags & float_muladd_negate_c) {
1588             uc.h = -uc.h;
1589         }
1590 
1591         ur.h = fmaf(ua.h, ub.h, uc.h);
1592 
1593         if (unlikely(f32_is_inf(ur))) {
1594             s->float_exception_flags |= float_flag_overflow;
1595         } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) {
1596             ua = ua_orig;
1597             uc = uc_orig;
1598             goto soft;
1599         }
1600     }
1601     if (flags & float_muladd_negate_result) {
1602         return float32_chs(ur.s);
1603     }
1604     return ur.s;
1605 
1606  soft:
1607     return soft_f32_muladd(ua.s, ub.s, uc.s, flags, s);
1608 }
1609 
1610 float64 QEMU_FLATTEN
1611 float64_muladd(float64 xa, float64 xb, float64 xc, int flags, float_status *s)
1612 {
1613     union_float64 ua, ub, uc, ur;
1614 
1615     ua.s = xa;
1616     ub.s = xb;
1617     uc.s = xc;
1618 
1619     if (unlikely(!can_use_fpu(s))) {
1620         goto soft;
1621     }
1622     if (unlikely(flags & float_muladd_halve_result)) {
1623         goto soft;
1624     }
1625 
1626     float64_input_flush3(&ua.s, &ub.s, &uc.s, s);
1627     if (unlikely(!f64_is_zon3(ua, ub, uc))) {
1628         goto soft;
1629     }
1630 
1631     if (unlikely(force_soft_fma)) {
1632         goto soft;
1633     }
1634 
1635     /*
1636      * When (a || b) == 0, there's no need to check for under/over flow,
1637      * since we know the addend is (normal || 0) and the product is 0.
1638      */
1639     if (float64_is_zero(ua.s) || float64_is_zero(ub.s)) {
1640         union_float64 up;
1641         bool prod_sign;
1642 
1643         prod_sign = float64_is_neg(ua.s) ^ float64_is_neg(ub.s);
1644         prod_sign ^= !!(flags & float_muladd_negate_product);
1645         up.s = float64_set_sign(float64_zero, prod_sign);
1646 
1647         if (flags & float_muladd_negate_c) {
1648             uc.h = -uc.h;
1649         }
1650         ur.h = up.h + uc.h;
1651     } else {
1652         union_float64 ua_orig = ua;
1653         union_float64 uc_orig = uc;
1654 
1655         if (flags & float_muladd_negate_product) {
1656             ua.h = -ua.h;
1657         }
1658         if (flags & float_muladd_negate_c) {
1659             uc.h = -uc.h;
1660         }
1661 
1662         ur.h = fma(ua.h, ub.h, uc.h);
1663 
1664         if (unlikely(f64_is_inf(ur))) {
1665             s->float_exception_flags |= float_flag_overflow;
1666         } else if (unlikely(fabs(ur.h) <= FLT_MIN)) {
1667             ua = ua_orig;
1668             uc = uc_orig;
1669             goto soft;
1670         }
1671     }
1672     if (flags & float_muladd_negate_result) {
1673         return float64_chs(ur.s);
1674     }
1675     return ur.s;
1676 
1677  soft:
1678     return soft_f64_muladd(ua.s, ub.s, uc.s, flags, s);
1679 }
1680 
1681 /*
1682  * Returns the result of dividing the floating-point value `a' by the
1683  * corresponding value `b'. The operation is performed according to
1684  * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1685  */
1686 
1687 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s)
1688 {
1689     bool sign = a.sign ^ b.sign;
1690 
1691     if (a.cls == float_class_normal && b.cls == float_class_normal) {
1692         uint64_t n0, n1, q, r;
1693         int exp = a.exp - b.exp;
1694 
1695         /*
1696          * We want a 2*N / N-bit division to produce exactly an N-bit
1697          * result, so that we do not lose any precision and so that we
1698          * do not have to renormalize afterward.  If A.frac < B.frac,
1699          * then division would produce an (N-1)-bit result; shift A left
1700          * by one to produce the an N-bit result, and decrement the
1701          * exponent to match.
1702          *
1703          * The udiv_qrnnd algorithm that we're using requires normalization,
1704          * i.e. the msb of the denominator must be set.  Since we know that
1705          * DECOMPOSED_BINARY_POINT is msb-1, the inputs must be shifted left
1706          * by one (more), and the remainder must be shifted right by one.
1707          */
1708         if (a.frac < b.frac) {
1709             exp -= 1;
1710             shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 2, &n1, &n0);
1711         } else {
1712             shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, &n1, &n0);
1713         }
1714         q = udiv_qrnnd(&r, n1, n0, b.frac << 1);
1715 
1716         /*
1717          * Set lsb if there is a remainder, to set inexact.
1718          * As mentioned above, to find the actual value of the remainder we
1719          * would need to shift right, but (1) we are only concerned about
1720          * non-zero-ness, and (2) the remainder will always be even because
1721          * both inputs to the division primitive are even.
1722          */
1723         a.frac = q | (r != 0);
1724         a.sign = sign;
1725         a.exp = exp;
1726         return a;
1727     }
1728     /* handle all the NaN cases */
1729     if (is_nan(a.cls) || is_nan(b.cls)) {
1730         return pick_nan(a, b, s);
1731     }
1732     /* 0/0 or Inf/Inf */
1733     if (a.cls == b.cls
1734         &&
1735         (a.cls == float_class_inf || a.cls == float_class_zero)) {
1736         s->float_exception_flags |= float_flag_invalid;
1737         return parts_default_nan(s);
1738     }
1739     /* Inf / x or 0 / x */
1740     if (a.cls == float_class_inf || a.cls == float_class_zero) {
1741         a.sign = sign;
1742         return a;
1743     }
1744     /* Div 0 => Inf */
1745     if (b.cls == float_class_zero) {
1746         s->float_exception_flags |= float_flag_divbyzero;
1747         a.cls = float_class_inf;
1748         a.sign = sign;
1749         return a;
1750     }
1751     /* Div by Inf */
1752     if (b.cls == float_class_inf) {
1753         a.cls = float_class_zero;
1754         a.sign = sign;
1755         return a;
1756     }
1757     g_assert_not_reached();
1758 }
1759 
1760 float16 float16_div(float16 a, float16 b, float_status *status)
1761 {
1762     FloatParts pa = float16_unpack_canonical(a, status);
1763     FloatParts pb = float16_unpack_canonical(b, status);
1764     FloatParts pr = div_floats(pa, pb, status);
1765 
1766     return float16_round_pack_canonical(pr, status);
1767 }
1768 
1769 static float32 QEMU_SOFTFLOAT_ATTR
1770 soft_f32_div(float32 a, float32 b, float_status *status)
1771 {
1772     FloatParts pa = float32_unpack_canonical(a, status);
1773     FloatParts pb = float32_unpack_canonical(b, status);
1774     FloatParts pr = div_floats(pa, pb, status);
1775 
1776     return float32_round_pack_canonical(pr, status);
1777 }
1778 
1779 static float64 QEMU_SOFTFLOAT_ATTR
1780 soft_f64_div(float64 a, float64 b, float_status *status)
1781 {
1782     FloatParts pa = float64_unpack_canonical(a, status);
1783     FloatParts pb = float64_unpack_canonical(b, status);
1784     FloatParts pr = div_floats(pa, pb, status);
1785 
1786     return float64_round_pack_canonical(pr, status);
1787 }
1788 
/* Divide a by b using the host's native single-precision arithmetic. */
static float hard_f32_div(float a, float b)
{
    float quotient = a / b;

    return quotient;
}
1793 
/* Divide a by b using the host's native double-precision arithmetic. */
static double hard_f64_div(double a, double b)
{
    double quotient = a / b;

    return quotient;
}
1798 
1799 static bool f32_div_pre(union_float32 a, union_float32 b)
1800 {
1801     if (QEMU_HARDFLOAT_2F32_USE_FP) {
1802         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
1803                fpclassify(b.h) == FP_NORMAL;
1804     }
1805     return float32_is_zero_or_normal(a.s) && float32_is_normal(b.s);
1806 }
1807 
1808 static bool f64_div_pre(union_float64 a, union_float64 b)
1809 {
1810     if (QEMU_HARDFLOAT_2F64_USE_FP) {
1811         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
1812                fpclassify(b.h) == FP_NORMAL;
1813     }
1814     return float64_is_zero_or_normal(a.s) && float64_is_normal(b.s);
1815 }
1816 
1817 static bool f32_div_post(union_float32 a, union_float32 b)
1818 {
1819     if (QEMU_HARDFLOAT_2F32_USE_FP) {
1820         return fpclassify(a.h) != FP_ZERO;
1821     }
1822     return !float32_is_zero(a.s);
1823 }
1824 
1825 static bool f64_div_post(union_float64 a, union_float64 b)
1826 {
1827     if (QEMU_HARDFLOAT_2F64_USE_FP) {
1828         return fpclassify(a.h) != FP_ZERO;
1829     }
1830     return !float64_is_zero(a.s);
1831 }
1832 
1833 float32 QEMU_FLATTEN
1834 float32_div(float32 a, float32 b, float_status *s)
1835 {
1836     return float32_gen2(a, b, s, hard_f32_div, soft_f32_div,
1837                         f32_div_pre, f32_div_post, NULL, NULL);
1838 }
1839 
1840 float64 QEMU_FLATTEN
1841 float64_div(float64 a, float64 b, float_status *s)
1842 {
1843     return float64_gen2(a, b, s, hard_f64_div, soft_f64_div,
1844                         f64_div_pre, f64_div_post, NULL, NULL);
1845 }
1846 
1847 /*
1848  * Float to Float conversions
1849  *
1850  * Returns the result of converting one float format to another. The
1851  * conversion is performed according to the IEC/IEEE Standard for
1852  * Binary Floating-Point Arithmetic.
1853  *
1854  * The float_to_float helper only needs to take care of raising
1855  * invalid exceptions and handling the conversion on NaNs.
1856  */
1857 
1858 static FloatParts float_to_float(FloatParts a, const FloatFmt *dstf,
1859                                  float_status *s)
1860 {
1861     if (dstf->arm_althp) {
1862         switch (a.cls) {
1863         case float_class_qnan:
1864         case float_class_snan:
1865             /* There is no NaN in the destination format.  Raise Invalid
1866              * and return a zero with the sign of the input NaN.
1867              */
1868             s->float_exception_flags |= float_flag_invalid;
1869             a.cls = float_class_zero;
1870             a.frac = 0;
1871             a.exp = 0;
1872             break;
1873 
1874         case float_class_inf:
1875             /* There is no Inf in the destination format.  Raise Invalid
1876              * and return the maximum normal with the correct sign.
1877              */
1878             s->float_exception_flags |= float_flag_invalid;
1879             a.cls = float_class_normal;
1880             a.exp = dstf->exp_max;
1881             a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift;
1882             break;
1883 
1884         default:
1885             break;
1886         }
1887     } else if (is_nan(a.cls)) {
1888         if (is_snan(a.cls)) {
1889             s->float_exception_flags |= float_flag_invalid;
1890             a = parts_silence_nan(a, s);
1891         }
1892         if (s->default_nan_mode) {
1893             return parts_default_nan(s);
1894         }
1895     }
1896     return a;
1897 }
1898 
1899 float32 float16_to_float32(float16 a, bool ieee, float_status *s)
1900 {
1901     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1902     FloatParts p = float16a_unpack_canonical(a, s, fmt16);
1903     FloatParts pr = float_to_float(p, &float32_params, s);
1904     return float32_round_pack_canonical(pr, s);
1905 }
1906 
1907 float64 float16_to_float64(float16 a, bool ieee, float_status *s)
1908 {
1909     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1910     FloatParts p = float16a_unpack_canonical(a, s, fmt16);
1911     FloatParts pr = float_to_float(p, &float64_params, s);
1912     return float64_round_pack_canonical(pr, s);
1913 }
1914 
1915 float16 float32_to_float16(float32 a, bool ieee, float_status *s)
1916 {
1917     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1918     FloatParts p = float32_unpack_canonical(a, s);
1919     FloatParts pr = float_to_float(p, fmt16, s);
1920     return float16a_round_pack_canonical(pr, s, fmt16);
1921 }
1922 
1923 static float64 QEMU_SOFTFLOAT_ATTR
1924 soft_float32_to_float64(float32 a, float_status *s)
1925 {
1926     FloatParts p = float32_unpack_canonical(a, s);
1927     FloatParts pr = float_to_float(p, &float64_params, s);
1928     return float64_round_pack_canonical(pr, s);
1929 }
1930 
1931 float64 float32_to_float64(float32 a, float_status *s)
1932 {
1933     if (likely(float32_is_normal(a))) {
1934         /* Widening conversion can never produce inexact results.  */
1935         union_float32 uf;
1936         union_float64 ud;
1937         uf.s = a;
1938         ud.h = uf.h;
1939         return ud.s;
1940     } else if (float32_is_zero(a)) {
1941         return float64_set_sign(float64_zero, float32_is_neg(a));
1942     } else {
1943         return soft_float32_to_float64(a, s);
1944     }
1945 }
1946 
1947 float16 float64_to_float16(float64 a, bool ieee, float_status *s)
1948 {
1949     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1950     FloatParts p = float64_unpack_canonical(a, s);
1951     FloatParts pr = float_to_float(p, fmt16, s);
1952     return float16a_round_pack_canonical(pr, s, fmt16);
1953 }
1954 
1955 float32 float64_to_float32(float64 a, float_status *s)
1956 {
1957     FloatParts p = float64_unpack_canonical(a, s);
1958     FloatParts pr = float_to_float(p, &float32_params, s);
1959     return float32_round_pack_canonical(pr, s);
1960 }
1961 
1962 /*
1963  * Rounds the floating-point value `a' to an integer, and returns the
1964  * result as a floating-point value. The operation is performed
1965  * according to the IEC/IEEE Standard for Binary Floating-Point
1966  * Arithmetic.
1967  */
1968 
1969 static FloatParts round_to_int(FloatParts a, int rmode,
1970                                int scale, float_status *s)
1971 {
1972     switch (a.cls) {
1973     case float_class_qnan:
1974     case float_class_snan:
1975         return return_nan(a, s);
1976 
1977     case float_class_zero:
1978     case float_class_inf:
1979         /* already "integral" */
1980         break;
1981 
1982     case float_class_normal:
1983         scale = MIN(MAX(scale, -0x10000), 0x10000);
1984         a.exp += scale;
1985 
1986         if (a.exp >= DECOMPOSED_BINARY_POINT) {
1987             /* already integral */
1988             break;
1989         }
1990         if (a.exp < 0) {
1991             bool one;
1992             /* all fractional */
1993             s->float_exception_flags |= float_flag_inexact;
1994             switch (rmode) {
1995             case float_round_nearest_even:
1996                 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
1997                 break;
1998             case float_round_ties_away:
1999                 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
2000                 break;
2001             case float_round_to_zero:
2002                 one = false;
2003                 break;
2004             case float_round_up:
2005                 one = !a.sign;
2006                 break;
2007             case float_round_down:
2008                 one = a.sign;
2009                 break;
2010             case float_round_to_odd:
2011                 one = true;
2012                 break;
2013             default:
2014                 g_assert_not_reached();
2015             }
2016 
2017             if (one) {
2018                 a.frac = DECOMPOSED_IMPLICIT_BIT;
2019                 a.exp = 0;
2020             } else {
2021                 a.cls = float_class_zero;
2022             }
2023         } else {
2024             uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
2025             uint64_t frac_lsbm1 = frac_lsb >> 1;
2026             uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
2027             uint64_t rnd_mask = rnd_even_mask >> 1;
2028             uint64_t inc;
2029 
2030             switch (rmode) {
2031             case float_round_nearest_even:
2032                 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
2033                 break;
2034             case float_round_ties_away:
2035                 inc = frac_lsbm1;
2036                 break;
2037             case float_round_to_zero:
2038                 inc = 0;
2039                 break;
2040             case float_round_up:
2041                 inc = a.sign ? 0 : rnd_mask;
2042                 break;
2043             case float_round_down:
2044                 inc = a.sign ? rnd_mask : 0;
2045                 break;
2046             case float_round_to_odd:
2047                 inc = a.frac & frac_lsb ? 0 : rnd_mask;
2048                 break;
2049             default:
2050                 g_assert_not_reached();
2051             }
2052 
2053             if (a.frac & rnd_mask) {
2054                 s->float_exception_flags |= float_flag_inexact;
2055                 a.frac += inc;
2056                 a.frac &= ~rnd_mask;
2057                 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
2058                     a.frac >>= 1;
2059                     a.exp++;
2060                 }
2061             }
2062         }
2063         break;
2064     default:
2065         g_assert_not_reached();
2066     }
2067     return a;
2068 }
2069 
2070 float16 float16_round_to_int(float16 a, float_status *s)
2071 {
2072     FloatParts pa = float16_unpack_canonical(a, s);
2073     FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2074     return float16_round_pack_canonical(pr, s);
2075 }
2076 
2077 float32 float32_round_to_int(float32 a, float_status *s)
2078 {
2079     FloatParts pa = float32_unpack_canonical(a, s);
2080     FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2081     return float32_round_pack_canonical(pr, s);
2082 }
2083 
2084 float64 float64_round_to_int(float64 a, float_status *s)
2085 {
2086     FloatParts pa = float64_unpack_canonical(a, s);
2087     FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2088     return float64_round_pack_canonical(pr, s);
2089 }
2090 
2091 /*
2092  * Returns the result of converting the floating-point value `a' to
2093  * the two's complement integer format. The conversion is performed
2094  * according to the IEC/IEEE Standard for Binary Floating-Point
2095  * Arithmetic---which means in particular that the conversion is
2096  * rounded according to the current rounding mode. If `a' is a NaN,
2097  * the largest positive integer is returned. Otherwise, if the
2098  * conversion overflows, the largest integer with the same sign as `a'
2099  * is returned.
2100 */
2101 
2102 static int64_t round_to_int_and_pack(FloatParts in, int rmode, int scale,
2103                                      int64_t min, int64_t max,
2104                                      float_status *s)
2105 {
2106     uint64_t r;
2107     int orig_flags = get_float_exception_flags(s);
2108     FloatParts p = round_to_int(in, rmode, scale, s);
2109 
2110     switch (p.cls) {
2111     case float_class_snan:
2112     case float_class_qnan:
2113         s->float_exception_flags = orig_flags | float_flag_invalid;
2114         return max;
2115     case float_class_inf:
2116         s->float_exception_flags = orig_flags | float_flag_invalid;
2117         return p.sign ? min : max;
2118     case float_class_zero:
2119         return 0;
2120     case float_class_normal:
2121         if (p.exp < DECOMPOSED_BINARY_POINT) {
2122             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
2123         } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
2124             r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
2125         } else {
2126             r = UINT64_MAX;
2127         }
2128         if (p.sign) {
2129             if (r <= -(uint64_t) min) {
2130                 return -r;
2131             } else {
2132                 s->float_exception_flags = orig_flags | float_flag_invalid;
2133                 return min;
2134             }
2135         } else {
2136             if (r <= max) {
2137                 return r;
2138             } else {
2139                 s->float_exception_flags = orig_flags | float_flag_invalid;
2140                 return max;
2141             }
2142         }
2143     default:
2144         g_assert_not_reached();
2145     }
2146 }
2147 
2148 int16_t float16_to_int16_scalbn(float16 a, int rmode, int scale,
2149                                 float_status *s)
2150 {
2151     return round_to_int_and_pack(float16_unpack_canonical(a, s),
2152                                  rmode, scale, INT16_MIN, INT16_MAX, s);
2153 }
2154 
2155 int32_t float16_to_int32_scalbn(float16 a, int rmode, int scale,
2156                                 float_status *s)
2157 {
2158     return round_to_int_and_pack(float16_unpack_canonical(a, s),
2159                                  rmode, scale, INT32_MIN, INT32_MAX, s);
2160 }
2161 
2162 int64_t float16_to_int64_scalbn(float16 a, int rmode, int scale,
2163                                 float_status *s)
2164 {
2165     return round_to_int_and_pack(float16_unpack_canonical(a, s),
2166                                  rmode, scale, INT64_MIN, INT64_MAX, s);
2167 }
2168 
2169 int16_t float32_to_int16_scalbn(float32 a, int rmode, int scale,
2170                                 float_status *s)
2171 {
2172     return round_to_int_and_pack(float32_unpack_canonical(a, s),
2173                                  rmode, scale, INT16_MIN, INT16_MAX, s);
2174 }
2175 
2176 int32_t float32_to_int32_scalbn(float32 a, int rmode, int scale,
2177                                 float_status *s)
2178 {
2179     return round_to_int_and_pack(float32_unpack_canonical(a, s),
2180                                  rmode, scale, INT32_MIN, INT32_MAX, s);
2181 }
2182 
2183 int64_t float32_to_int64_scalbn(float32 a, int rmode, int scale,
2184                                 float_status *s)
2185 {
2186     return round_to_int_and_pack(float32_unpack_canonical(a, s),
2187                                  rmode, scale, INT64_MIN, INT64_MAX, s);
2188 }
2189 
2190 int16_t float64_to_int16_scalbn(float64 a, int rmode, int scale,
2191                                 float_status *s)
2192 {
2193     return round_to_int_and_pack(float64_unpack_canonical(a, s),
2194                                  rmode, scale, INT16_MIN, INT16_MAX, s);
2195 }
2196 
2197 int32_t float64_to_int32_scalbn(float64 a, int rmode, int scale,
2198                                 float_status *s)
2199 {
2200     return round_to_int_and_pack(float64_unpack_canonical(a, s),
2201                                  rmode, scale, INT32_MIN, INT32_MAX, s);
2202 }
2203 
2204 int64_t float64_to_int64_scalbn(float64 a, int rmode, int scale,
2205                                 float_status *s)
2206 {
2207     return round_to_int_and_pack(float64_unpack_canonical(a, s),
2208                                  rmode, scale, INT64_MIN, INT64_MAX, s);
2209 }
2210 
2211 int16_t float16_to_int16(float16 a, float_status *s)
2212 {
2213     return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2214 }
2215 
2216 int32_t float16_to_int32(float16 a, float_status *s)
2217 {
2218     return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2219 }
2220 
2221 int64_t float16_to_int64(float16 a, float_status *s)
2222 {
2223     return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2224 }
2225 
2226 int16_t float32_to_int16(float32 a, float_status *s)
2227 {
2228     return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2229 }
2230 
2231 int32_t float32_to_int32(float32 a, float_status *s)
2232 {
2233     return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2234 }
2235 
2236 int64_t float32_to_int64(float32 a, float_status *s)
2237 {
2238     return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2239 }
2240 
2241 int16_t float64_to_int16(float64 a, float_status *s)
2242 {
2243     return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2244 }
2245 
2246 int32_t float64_to_int32(float64 a, float_status *s)
2247 {
2248     return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2249 }
2250 
2251 int64_t float64_to_int64(float64 a, float_status *s)
2252 {
2253     return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2254 }
2255 
2256 int16_t float16_to_int16_round_to_zero(float16 a, float_status *s)
2257 {
2258     return float16_to_int16_scalbn(a, float_round_to_zero, 0, s);
2259 }
2260 
2261 int32_t float16_to_int32_round_to_zero(float16 a, float_status *s)
2262 {
2263     return float16_to_int32_scalbn(a, float_round_to_zero, 0, s);
2264 }
2265 
2266 int64_t float16_to_int64_round_to_zero(float16 a, float_status *s)
2267 {
2268     return float16_to_int64_scalbn(a, float_round_to_zero, 0, s);
2269 }
2270 
2271 int16_t float32_to_int16_round_to_zero(float32 a, float_status *s)
2272 {
2273     return float32_to_int16_scalbn(a, float_round_to_zero, 0, s);
2274 }
2275 
2276 int32_t float32_to_int32_round_to_zero(float32 a, float_status *s)
2277 {
2278     return float32_to_int32_scalbn(a, float_round_to_zero, 0, s);
2279 }
2280 
2281 int64_t float32_to_int64_round_to_zero(float32 a, float_status *s)
2282 {
2283     return float32_to_int64_scalbn(a, float_round_to_zero, 0, s);
2284 }
2285 
2286 int16_t float64_to_int16_round_to_zero(float64 a, float_status *s)
2287 {
2288     return float64_to_int16_scalbn(a, float_round_to_zero, 0, s);
2289 }
2290 
2291 int32_t float64_to_int32_round_to_zero(float64 a, float_status *s)
2292 {
2293     return float64_to_int32_scalbn(a, float_round_to_zero, 0, s);
2294 }
2295 
2296 int64_t float64_to_int64_round_to_zero(float64 a, float_status *s)
2297 {
2298     return float64_to_int64_scalbn(a, float_round_to_zero, 0, s);
2299 }
2300 
2301 /*
2302  *  Returns the result of converting the floating-point value `a' to
2303  *  the unsigned integer format. The conversion is performed according
2304  *  to the IEC/IEEE Standard for Binary Floating-Point
2305  *  Arithmetic---which means in particular that the conversion is
2306  *  rounded according to the current rounding mode. If `a' is a NaN,
2307  *  the largest unsigned integer is returned. Otherwise, if the
 *  conversion overflows, the largest unsigned integer is returned. If
 *  `a' is negative, the result is rounded and zero is returned;
 *  values that do not round to zero raise the invalid exception
 *  flag.
2312  */
2313 
2314 static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, int scale,
2315                                        uint64_t max, float_status *s)
2316 {
2317     int orig_flags = get_float_exception_flags(s);
2318     FloatParts p = round_to_int(in, rmode, scale, s);
2319     uint64_t r;
2320 
2321     switch (p.cls) {
2322     case float_class_snan:
2323     case float_class_qnan:
2324         s->float_exception_flags = orig_flags | float_flag_invalid;
2325         return max;
2326     case float_class_inf:
2327         s->float_exception_flags = orig_flags | float_flag_invalid;
2328         return p.sign ? 0 : max;
2329     case float_class_zero:
2330         return 0;
2331     case float_class_normal:
2332         if (p.sign) {
2333             s->float_exception_flags = orig_flags | float_flag_invalid;
2334             return 0;
2335         }
2336 
2337         if (p.exp < DECOMPOSED_BINARY_POINT) {
2338             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
2339         } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
2340             r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
2341         } else {
2342             s->float_exception_flags = orig_flags | float_flag_invalid;
2343             return max;
2344         }
2345 
2346         /* For uint64 this will never trip, but if p.exp is too large
2347          * to shift a decomposed fraction we shall have exited via the
2348          * 3rd leg above.
2349          */
2350         if (r > max) {
2351             s->float_exception_flags = orig_flags | float_flag_invalid;
2352             return max;
2353         }
2354         return r;
2355     default:
2356         g_assert_not_reached();
2357     }
2358 }
2359 
2360 uint16_t float16_to_uint16_scalbn(float16 a, int rmode, int scale,
2361                                   float_status *s)
2362 {
2363     return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2364                                   rmode, scale, UINT16_MAX, s);
2365 }
2366 
2367 uint32_t float16_to_uint32_scalbn(float16 a, int rmode, int scale,
2368                                   float_status *s)
2369 {
2370     return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2371                                   rmode, scale, UINT32_MAX, s);
2372 }
2373 
2374 uint64_t float16_to_uint64_scalbn(float16 a, int rmode, int scale,
2375                                   float_status *s)
2376 {
2377     return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2378                                   rmode, scale, UINT64_MAX, s);
2379 }
2380 
2381 uint16_t float32_to_uint16_scalbn(float32 a, int rmode, int scale,
2382                                   float_status *s)
2383 {
2384     return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2385                                   rmode, scale, UINT16_MAX, s);
2386 }
2387 
2388 uint32_t float32_to_uint32_scalbn(float32 a, int rmode, int scale,
2389                                   float_status *s)
2390 {
2391     return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2392                                   rmode, scale, UINT32_MAX, s);
2393 }
2394 
2395 uint64_t float32_to_uint64_scalbn(float32 a, int rmode, int scale,
2396                                   float_status *s)
2397 {
2398     return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2399                                   rmode, scale, UINT64_MAX, s);
2400 }
2401 
2402 uint16_t float64_to_uint16_scalbn(float64 a, int rmode, int scale,
2403                                   float_status *s)
2404 {
2405     return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2406                                   rmode, scale, UINT16_MAX, s);
2407 }
2408 
2409 uint32_t float64_to_uint32_scalbn(float64 a, int rmode, int scale,
2410                                   float_status *s)
2411 {
2412     return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2413                                   rmode, scale, UINT32_MAX, s);
2414 }
2415 
2416 uint64_t float64_to_uint64_scalbn(float64 a, int rmode, int scale,
2417                                   float_status *s)
2418 {
2419     return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2420                                   rmode, scale, UINT64_MAX, s);
2421 }
2422 
2423 uint16_t float16_to_uint16(float16 a, float_status *s)
2424 {
2425     return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2426 }
2427 
2428 uint32_t float16_to_uint32(float16 a, float_status *s)
2429 {
2430     return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2431 }
2432 
2433 uint64_t float16_to_uint64(float16 a, float_status *s)
2434 {
2435     return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2436 }
2437 
2438 uint16_t float32_to_uint16(float32 a, float_status *s)
2439 {
2440     return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2441 }
2442 
2443 uint32_t float32_to_uint32(float32 a, float_status *s)
2444 {
2445     return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2446 }
2447 
2448 uint64_t float32_to_uint64(float32 a, float_status *s)
2449 {
2450     return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2451 }
2452 
2453 uint16_t float64_to_uint16(float64 a, float_status *s)
2454 {
2455     return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2456 }
2457 
2458 uint32_t float64_to_uint32(float64 a, float_status *s)
2459 {
2460     return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2461 }
2462 
2463 uint64_t float64_to_uint64(float64 a, float_status *s)
2464 {
2465     return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2466 }
2467 
2468 uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s)
2469 {
2470     return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2471 }
2472 
2473 uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s)
2474 {
2475     return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2476 }
2477 
2478 uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s)
2479 {
2480     return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2481 }
2482 
2483 uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *s)
2484 {
2485     return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2486 }
2487 
2488 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s)
2489 {
2490     return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2491 }
2492 
2493 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s)
2494 {
2495     return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2496 }
2497 
2498 uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s)
2499 {
2500     return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2501 }
2502 
2503 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s)
2504 {
2505     return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2506 }
2507 
2508 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s)
2509 {
2510     return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2511 }
2512 
2513 /*
2514  * Integer to float conversions
2515  *
2516  * Returns the result of converting the two's complement integer `a'
2517  * to the floating-point format. The conversion is performed according
2518  * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2519  */
2520 
2521 static FloatParts int_to_float(int64_t a, int scale, float_status *status)
2522 {
2523     FloatParts r = { .sign = false };
2524 
2525     if (a == 0) {
2526         r.cls = float_class_zero;
2527     } else {
2528         uint64_t f = a;
2529         int shift;
2530 
2531         r.cls = float_class_normal;
2532         if (a < 0) {
2533             f = -f;
2534             r.sign = true;
2535         }
2536         shift = clz64(f) - 1;
2537         scale = MIN(MAX(scale, -0x10000), 0x10000);
2538 
2539         r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
2540         r.frac = (shift < 0 ? DECOMPOSED_IMPLICIT_BIT : f << shift);
2541     }
2542 
2543     return r;
2544 }
2545 
2546 float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status)
2547 {
2548     FloatParts pa = int_to_float(a, scale, status);
2549     return float16_round_pack_canonical(pa, status);
2550 }
2551 
2552 float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status)
2553 {
2554     return int64_to_float16_scalbn(a, scale, status);
2555 }
2556 
2557 float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status)
2558 {
2559     return int64_to_float16_scalbn(a, scale, status);
2560 }
2561 
2562 float16 int64_to_float16(int64_t a, float_status *status)
2563 {
2564     return int64_to_float16_scalbn(a, 0, status);
2565 }
2566 
2567 float16 int32_to_float16(int32_t a, float_status *status)
2568 {
2569     return int64_to_float16_scalbn(a, 0, status);
2570 }
2571 
2572 float16 int16_to_float16(int16_t a, float_status *status)
2573 {
2574     return int64_to_float16_scalbn(a, 0, status);
2575 }
2576 
2577 float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status)
2578 {
2579     FloatParts pa = int_to_float(a, scale, status);
2580     return float32_round_pack_canonical(pa, status);
2581 }
2582 
2583 float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status)
2584 {
2585     return int64_to_float32_scalbn(a, scale, status);
2586 }
2587 
2588 float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status)
2589 {
2590     return int64_to_float32_scalbn(a, scale, status);
2591 }
2592 
2593 float32 int64_to_float32(int64_t a, float_status *status)
2594 {
2595     return int64_to_float32_scalbn(a, 0, status);
2596 }
2597 
2598 float32 int32_to_float32(int32_t a, float_status *status)
2599 {
2600     return int64_to_float32_scalbn(a, 0, status);
2601 }
2602 
2603 float32 int16_to_float32(int16_t a, float_status *status)
2604 {
2605     return int64_to_float32_scalbn(a, 0, status);
2606 }
2607 
2608 float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status)
2609 {
2610     FloatParts pa = int_to_float(a, scale, status);
2611     return float64_round_pack_canonical(pa, status);
2612 }
2613 
2614 float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status)
2615 {
2616     return int64_to_float64_scalbn(a, scale, status);
2617 }
2618 
2619 float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status)
2620 {
2621     return int64_to_float64_scalbn(a, scale, status);
2622 }
2623 
2624 float64 int64_to_float64(int64_t a, float_status *status)
2625 {
2626     return int64_to_float64_scalbn(a, 0, status);
2627 }
2628 
2629 float64 int32_to_float64(int32_t a, float_status *status)
2630 {
2631     return int64_to_float64_scalbn(a, 0, status);
2632 }
2633 
2634 float64 int16_to_float64(int16_t a, float_status *status)
2635 {
2636     return int64_to_float64_scalbn(a, 0, status);
2637 }
2638 
2639 
2640 /*
2641  * Unsigned Integer to float conversions
2642  *
2643  * Returns the result of converting the unsigned integer `a' to the
2644  * floating-point format. The conversion is performed according to the
2645  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2646  */
2647 
2648 static FloatParts uint_to_float(uint64_t a, int scale, float_status *status)
2649 {
2650     FloatParts r = { .sign = false };
2651 
2652     if (a == 0) {
2653         r.cls = float_class_zero;
2654     } else {
2655         scale = MIN(MAX(scale, -0x10000), 0x10000);
2656         r.cls = float_class_normal;
2657         if ((int64_t)a < 0) {
2658             r.exp = DECOMPOSED_BINARY_POINT + 1 + scale;
2659             shift64RightJamming(a, 1, &a);
2660             r.frac = a;
2661         } else {
2662             int shift = clz64(a) - 1;
2663             r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
2664             r.frac = a << shift;
2665         }
2666     }
2667 
2668     return r;
2669 }
2670 
2671 float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status)
2672 {
2673     FloatParts pa = uint_to_float(a, scale, status);
2674     return float16_round_pack_canonical(pa, status);
2675 }
2676 
2677 float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status)
2678 {
2679     return uint64_to_float16_scalbn(a, scale, status);
2680 }
2681 
2682 float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status)
2683 {
2684     return uint64_to_float16_scalbn(a, scale, status);
2685 }
2686 
2687 float16 uint64_to_float16(uint64_t a, float_status *status)
2688 {
2689     return uint64_to_float16_scalbn(a, 0, status);
2690 }
2691 
2692 float16 uint32_to_float16(uint32_t a, float_status *status)
2693 {
2694     return uint64_to_float16_scalbn(a, 0, status);
2695 }
2696 
2697 float16 uint16_to_float16(uint16_t a, float_status *status)
2698 {
2699     return uint64_to_float16_scalbn(a, 0, status);
2700 }
2701 
2702 float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status)
2703 {
2704     FloatParts pa = uint_to_float(a, scale, status);
2705     return float32_round_pack_canonical(pa, status);
2706 }
2707 
2708 float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status)
2709 {
2710     return uint64_to_float32_scalbn(a, scale, status);
2711 }
2712 
2713 float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status)
2714 {
2715     return uint64_to_float32_scalbn(a, scale, status);
2716 }
2717 
2718 float32 uint64_to_float32(uint64_t a, float_status *status)
2719 {
2720     return uint64_to_float32_scalbn(a, 0, status);
2721 }
2722 
2723 float32 uint32_to_float32(uint32_t a, float_status *status)
2724 {
2725     return uint64_to_float32_scalbn(a, 0, status);
2726 }
2727 
2728 float32 uint16_to_float32(uint16_t a, float_status *status)
2729 {
2730     return uint64_to_float32_scalbn(a, 0, status);
2731 }
2732 
2733 float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status)
2734 {
2735     FloatParts pa = uint_to_float(a, scale, status);
2736     return float64_round_pack_canonical(pa, status);
2737 }
2738 
2739 float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status)
2740 {
2741     return uint64_to_float64_scalbn(a, scale, status);
2742 }
2743 
2744 float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status)
2745 {
2746     return uint64_to_float64_scalbn(a, scale, status);
2747 }
2748 
2749 float64 uint64_to_float64(uint64_t a, float_status *status)
2750 {
2751     return uint64_to_float64_scalbn(a, 0, status);
2752 }
2753 
2754 float64 uint32_to_float64(uint32_t a, float_status *status)
2755 {
2756     return uint64_to_float64_scalbn(a, 0, status);
2757 }
2758 
2759 float64 uint16_to_float64(uint16_t a, float_status *status)
2760 {
2761     return uint64_to_float64_scalbn(a, 0, status);
2762 }
2763 
2764 /* Float Min/Max */
2765 /* min() and max() functions. These can't be implemented as
2766  * 'compare and pick one input' because that would mishandle
2767  * NaNs and +0 vs -0.
2768  *
2769  * minnum() and maxnum() functions. These are similar to the min()
2770  * and max() functions but if one of the arguments is a QNaN and
2771  * the other is numerical then the numerical argument is returned.
2772  * SNaNs will get quietened before being returned.
2773  * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
2774  * and maxNum() operations. min() and max() are the typical min/max
2775  * semantics provided by many CPUs which predate that specification.
2776  *
 * minnummag() and maxnummag() functions correspond to minNumMag()
 * and maxNumMag() from the IEEE-754 2008.
2779  */
2780 static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin,
2781                                 bool ieee, bool ismag, float_status *s)
2782 {
2783     if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
2784         if (ieee) {
2785             /* Takes two floating-point values `a' and `b', one of
2786              * which is a NaN, and returns the appropriate NaN
2787              * result. If either `a' or `b' is a signaling NaN,
2788              * the invalid exception is raised.
2789              */
2790             if (is_snan(a.cls) || is_snan(b.cls)) {
2791                 return pick_nan(a, b, s);
2792             } else if (is_nan(a.cls) && !is_nan(b.cls)) {
2793                 return b;
2794             } else if (is_nan(b.cls) && !is_nan(a.cls)) {
2795                 return a;
2796             }
2797         }
2798         return pick_nan(a, b, s);
2799     } else {
2800         int a_exp, b_exp;
2801 
2802         switch (a.cls) {
2803         case float_class_normal:
2804             a_exp = a.exp;
2805             break;
2806         case float_class_inf:
2807             a_exp = INT_MAX;
2808             break;
2809         case float_class_zero:
2810             a_exp = INT_MIN;
2811             break;
2812         default:
2813             g_assert_not_reached();
2814             break;
2815         }
2816         switch (b.cls) {
2817         case float_class_normal:
2818             b_exp = b.exp;
2819             break;
2820         case float_class_inf:
2821             b_exp = INT_MAX;
2822             break;
2823         case float_class_zero:
2824             b_exp = INT_MIN;
2825             break;
2826         default:
2827             g_assert_not_reached();
2828             break;
2829         }
2830 
2831         if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
2832             bool a_less = a_exp < b_exp;
2833             if (a_exp == b_exp) {
2834                 a_less = a.frac < b.frac;
2835             }
2836             return a_less ^ ismin ? b : a;
2837         }
2838 
2839         if (a.sign == b.sign) {
2840             bool a_less = a_exp < b_exp;
2841             if (a_exp == b_exp) {
2842                 a_less = a.frac < b.frac;
2843             }
2844             return a.sign ^ a_less ^ ismin ? b : a;
2845         } else {
2846             return a.sign ^ ismin ? b : a;
2847         }
2848     }
2849 }
2850 
2851 #define MINMAX(sz, name, ismin, isiee, ismag)                           \
2852 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b,      \
2853                                      float_status *s)                   \
2854 {                                                                       \
2855     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
2856     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
2857     FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);      \
2858                                                                         \
2859     return float ## sz ## _round_pack_canonical(pr, s);                 \
2860 }
2861 
2862 MINMAX(16, min, true, false, false)
2863 MINMAX(16, minnum, true, true, false)
2864 MINMAX(16, minnummag, true, true, true)
2865 MINMAX(16, max, false, false, false)
2866 MINMAX(16, maxnum, false, true, false)
2867 MINMAX(16, maxnummag, false, true, true)
2868 
2869 MINMAX(32, min, true, false, false)
2870 MINMAX(32, minnum, true, true, false)
2871 MINMAX(32, minnummag, true, true, true)
2872 MINMAX(32, max, false, false, false)
2873 MINMAX(32, maxnum, false, true, false)
2874 MINMAX(32, maxnummag, false, true, true)
2875 
2876 MINMAX(64, min, true, false, false)
2877 MINMAX(64, minnum, true, true, false)
2878 MINMAX(64, minnummag, true, true, true)
2879 MINMAX(64, max, false, false, false)
2880 MINMAX(64, maxnum, false, true, false)
2881 MINMAX(64, maxnummag, false, true, true)
2882 
2883 #undef MINMAX
2884 
2885 /* Floating point compare */
2886 static int compare_floats(FloatParts a, FloatParts b, bool is_quiet,
2887                           float_status *s)
2888 {
2889     if (is_nan(a.cls) || is_nan(b.cls)) {
2890         if (!is_quiet ||
2891             a.cls == float_class_snan ||
2892             b.cls == float_class_snan) {
2893             s->float_exception_flags |= float_flag_invalid;
2894         }
2895         return float_relation_unordered;
2896     }
2897 
2898     if (a.cls == float_class_zero) {
2899         if (b.cls == float_class_zero) {
2900             return float_relation_equal;
2901         }
2902         return b.sign ? float_relation_greater : float_relation_less;
2903     } else if (b.cls == float_class_zero) {
2904         return a.sign ? float_relation_less : float_relation_greater;
2905     }
2906 
2907     /* The only really important thing about infinity is its sign. If
2908      * both are infinities the sign marks the smallest of the two.
2909      */
2910     if (a.cls == float_class_inf) {
2911         if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
2912             return float_relation_equal;
2913         }
2914         return a.sign ? float_relation_less : float_relation_greater;
2915     } else if (b.cls == float_class_inf) {
2916         return b.sign ? float_relation_greater : float_relation_less;
2917     }
2918 
2919     if (a.sign != b.sign) {
2920         return a.sign ? float_relation_less : float_relation_greater;
2921     }
2922 
2923     if (a.exp == b.exp) {
2924         if (a.frac == b.frac) {
2925             return float_relation_equal;
2926         }
2927         if (a.sign) {
2928             return a.frac > b.frac ?
2929                 float_relation_less : float_relation_greater;
2930         } else {
2931             return a.frac > b.frac ?
2932                 float_relation_greater : float_relation_less;
2933         }
2934     } else {
2935         if (a.sign) {
2936             return a.exp > b.exp ? float_relation_less : float_relation_greater;
2937         } else {
2938             return a.exp > b.exp ? float_relation_greater : float_relation_less;
2939         }
2940     }
2941 }
2942 
2943 #define COMPARE(name, attr, sz)                                         \
2944 static int attr                                                         \
2945 name(float ## sz a, float ## sz b, bool is_quiet, float_status *s)      \
2946 {                                                                       \
2947     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
2948     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
2949     return compare_floats(pa, pb, is_quiet, s);                         \
2950 }
2951 
2952 COMPARE(soft_f16_compare, QEMU_FLATTEN, 16)
2953 COMPARE(soft_f32_compare, QEMU_SOFTFLOAT_ATTR, 32)
2954 COMPARE(soft_f64_compare, QEMU_SOFTFLOAT_ATTR, 64)
2955 
2956 #undef COMPARE
2957 
2958 int float16_compare(float16 a, float16 b, float_status *s)
2959 {
2960     return soft_f16_compare(a, b, false, s);
2961 }
2962 
2963 int float16_compare_quiet(float16 a, float16 b, float_status *s)
2964 {
2965     return soft_f16_compare(a, b, true, s);
2966 }
2967 
2968 static int QEMU_FLATTEN
2969 f32_compare(float32 xa, float32 xb, bool is_quiet, float_status *s)
2970 {
2971     union_float32 ua, ub;
2972 
2973     ua.s = xa;
2974     ub.s = xb;
2975 
2976     if (QEMU_NO_HARDFLOAT) {
2977         goto soft;
2978     }
2979 
2980     float32_input_flush2(&ua.s, &ub.s, s);
2981     if (isgreaterequal(ua.h, ub.h)) {
2982         if (isgreater(ua.h, ub.h)) {
2983             return float_relation_greater;
2984         }
2985         return float_relation_equal;
2986     }
2987     if (likely(isless(ua.h, ub.h))) {
2988         return float_relation_less;
2989     }
2990     /* The only condition remaining is unordered.
2991      * Fall through to set flags.
2992      */
2993  soft:
2994     return soft_f32_compare(ua.s, ub.s, is_quiet, s);
2995 }
2996 
2997 int float32_compare(float32 a, float32 b, float_status *s)
2998 {
2999     return f32_compare(a, b, false, s);
3000 }
3001 
3002 int float32_compare_quiet(float32 a, float32 b, float_status *s)
3003 {
3004     return f32_compare(a, b, true, s);
3005 }
3006 
3007 static int QEMU_FLATTEN
3008 f64_compare(float64 xa, float64 xb, bool is_quiet, float_status *s)
3009 {
3010     union_float64 ua, ub;
3011 
3012     ua.s = xa;
3013     ub.s = xb;
3014 
3015     if (QEMU_NO_HARDFLOAT) {
3016         goto soft;
3017     }
3018 
3019     float64_input_flush2(&ua.s, &ub.s, s);
3020     if (isgreaterequal(ua.h, ub.h)) {
3021         if (isgreater(ua.h, ub.h)) {
3022             return float_relation_greater;
3023         }
3024         return float_relation_equal;
3025     }
3026     if (likely(isless(ua.h, ub.h))) {
3027         return float_relation_less;
3028     }
3029     /* The only condition remaining is unordered.
3030      * Fall through to set flags.
3031      */
3032  soft:
3033     return soft_f64_compare(ua.s, ub.s, is_quiet, s);
3034 }
3035 
3036 int float64_compare(float64 a, float64 b, float_status *s)
3037 {
3038     return f64_compare(a, b, false, s);
3039 }
3040 
3041 int float64_compare_quiet(float64 a, float64 b, float_status *s)
3042 {
3043     return f64_compare(a, b, true, s);
3044 }
3045 
3046 /* Multiply A by 2 raised to the power N.  */
3047 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s)
3048 {
3049     if (unlikely(is_nan(a.cls))) {
3050         return return_nan(a, s);
3051     }
3052     if (a.cls == float_class_normal) {
3053         /* The largest float type (even though not supported by FloatParts)
3054          * is float128, which has a 15 bit exponent.  Bounding N to 16 bits
3055          * still allows rounding to infinity, without allowing overflow
3056          * within the int32_t that backs FloatParts.exp.
3057          */
3058         n = MIN(MAX(n, -0x10000), 0x10000);
3059         a.exp += n;
3060     }
3061     return a;
3062 }
3063 
3064 float16 float16_scalbn(float16 a, int n, float_status *status)
3065 {
3066     FloatParts pa = float16_unpack_canonical(a, status);
3067     FloatParts pr = scalbn_decomposed(pa, n, status);
3068     return float16_round_pack_canonical(pr, status);
3069 }
3070 
3071 float32 float32_scalbn(float32 a, int n, float_status *status)
3072 {
3073     FloatParts pa = float32_unpack_canonical(a, status);
3074     FloatParts pr = scalbn_decomposed(pa, n, status);
3075     return float32_round_pack_canonical(pr, status);
3076 }
3077 
3078 float64 float64_scalbn(float64 a, int n, float_status *status)
3079 {
3080     FloatParts pa = float64_unpack_canonical(a, status);
3081     FloatParts pr = scalbn_decomposed(pa, n, status);
3082     return float64_round_pack_canonical(pr, status);
3083 }
3084 
3085 /*
3086  * Square Root
3087  *
3088  * The old softfloat code did an approximation step before zeroing in
3089  * on the final result. However for simpleness we just compute the
3090  * square root by iterating down from the implicit bit to enough extra
3091  * bits to ensure we get a correctly rounded result.
3092  *
3093  * This does mean however the calculation is slower than before,
3094  * especially for 64 bit floats.
3095  */
3096 
3097 static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p)
3098 {
3099     uint64_t a_frac, r_frac, s_frac;
3100     int bit, last_bit;
3101 
3102     if (is_nan(a.cls)) {
3103         return return_nan(a, s);
3104     }
3105     if (a.cls == float_class_zero) {
3106         return a;  /* sqrt(+-0) = +-0 */
3107     }
3108     if (a.sign) {
3109         s->float_exception_flags |= float_flag_invalid;
3110         return parts_default_nan(s);
3111     }
3112     if (a.cls == float_class_inf) {
3113         return a;  /* sqrt(+inf) = +inf */
3114     }
3115 
3116     assert(a.cls == float_class_normal);
3117 
3118     /* We need two overflow bits at the top. Adding room for that is a
3119      * right shift. If the exponent is odd, we can discard the low bit
3120      * by multiplying the fraction by 2; that's a left shift. Combine
3121      * those and we shift right if the exponent is even.
3122      */
3123     a_frac = a.frac;
3124     if (!(a.exp & 1)) {
3125         a_frac >>= 1;
3126     }
3127     a.exp >>= 1;
3128 
3129     /* Bit-by-bit computation of sqrt.  */
3130     r_frac = 0;
3131     s_frac = 0;
3132 
3133     /* Iterate from implicit bit down to the 3 extra bits to compute a
3134      * properly rounded result. Remember we've inserted one more bit
3135      * at the top, so these positions are one less.
3136      */
3137     bit = DECOMPOSED_BINARY_POINT - 1;
3138     last_bit = MAX(p->frac_shift - 4, 0);
3139     do {
3140         uint64_t q = 1ULL << bit;
3141         uint64_t t_frac = s_frac + q;
3142         if (t_frac <= a_frac) {
3143             s_frac = t_frac + q;
3144             a_frac -= t_frac;
3145             r_frac += q;
3146         }
3147         a_frac <<= 1;
3148     } while (--bit >= last_bit);
3149 
3150     /* Undo the right shift done above. If there is any remaining
3151      * fraction, the result is inexact. Set the sticky bit.
3152      */
3153     a.frac = (r_frac << 1) + (a_frac != 0);
3154 
3155     return a;
3156 }
3157 
3158 float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status)
3159 {
3160     FloatParts pa = float16_unpack_canonical(a, status);
3161     FloatParts pr = sqrt_float(pa, status, &float16_params);
3162     return float16_round_pack_canonical(pr, status);
3163 }
3164 
3165 static float32 QEMU_SOFTFLOAT_ATTR
3166 soft_f32_sqrt(float32 a, float_status *status)
3167 {
3168     FloatParts pa = float32_unpack_canonical(a, status);
3169     FloatParts pr = sqrt_float(pa, status, &float32_params);
3170     return float32_round_pack_canonical(pr, status);
3171 }
3172 
3173 static float64 QEMU_SOFTFLOAT_ATTR
3174 soft_f64_sqrt(float64 a, float_status *status)
3175 {
3176     FloatParts pa = float64_unpack_canonical(a, status);
3177     FloatParts pr = sqrt_float(pa, status, &float64_params);
3178     return float64_round_pack_canonical(pr, status);
3179 }
3180 
3181 float32 QEMU_FLATTEN float32_sqrt(float32 xa, float_status *s)
3182 {
3183     union_float32 ua, ur;
3184 
3185     ua.s = xa;
3186     if (unlikely(!can_use_fpu(s))) {
3187         goto soft;
3188     }
3189 
3190     float32_input_flush1(&ua.s, s);
3191     if (QEMU_HARDFLOAT_1F32_USE_FP) {
3192         if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
3193                        fpclassify(ua.h) == FP_ZERO) ||
3194                      signbit(ua.h))) {
3195             goto soft;
3196         }
3197     } else if (unlikely(!float32_is_zero_or_normal(ua.s) ||
3198                         float32_is_neg(ua.s))) {
3199         goto soft;
3200     }
3201     ur.h = sqrtf(ua.h);
3202     return ur.s;
3203 
3204  soft:
3205     return soft_f32_sqrt(ua.s, s);
3206 }
3207 
3208 float64 QEMU_FLATTEN float64_sqrt(float64 xa, float_status *s)
3209 {
3210     union_float64 ua, ur;
3211 
3212     ua.s = xa;
3213     if (unlikely(!can_use_fpu(s))) {
3214         goto soft;
3215     }
3216 
3217     float64_input_flush1(&ua.s, s);
3218     if (QEMU_HARDFLOAT_1F64_USE_FP) {
3219         if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
3220                        fpclassify(ua.h) == FP_ZERO) ||
3221                      signbit(ua.h))) {
3222             goto soft;
3223         }
3224     } else if (unlikely(!float64_is_zero_or_normal(ua.s) ||
3225                         float64_is_neg(ua.s))) {
3226         goto soft;
3227     }
3228     ur.h = sqrt(ua.h);
3229     return ur.s;
3230 
3231  soft:
3232     return soft_f64_sqrt(ua.s, s);
3233 }
3234 
3235 /*----------------------------------------------------------------------------
3236 | The pattern for a default generated NaN.
3237 *----------------------------------------------------------------------------*/
3238 
3239 float16 float16_default_nan(float_status *status)
3240 {
3241     FloatParts p = parts_default_nan(status);
3242     p.frac >>= float16_params.frac_shift;
3243     return float16_pack_raw(p);
3244 }
3245 
3246 float32 float32_default_nan(float_status *status)
3247 {
3248     FloatParts p = parts_default_nan(status);
3249     p.frac >>= float32_params.frac_shift;
3250     return float32_pack_raw(p);
3251 }
3252 
3253 float64 float64_default_nan(float_status *status)
3254 {
3255     FloatParts p = parts_default_nan(status);
3256     p.frac >>= float64_params.frac_shift;
3257     return float64_pack_raw(p);
3258 }
3259 
3260 float128 float128_default_nan(float_status *status)
3261 {
3262     FloatParts p = parts_default_nan(status);
3263     float128 r;
3264 
3265     /* Extrapolate from the choices made by parts_default_nan to fill
3266      * in the quad-floating format.  If the low bit is set, assume we
3267      * want to set all non-snan bits.
3268      */
3269     r.low = -(p.frac & 1);
3270     r.high = p.frac >> (DECOMPOSED_BINARY_POINT - 48);
3271     r.high |= UINT64_C(0x7FFF000000000000);
3272     r.high |= (uint64_t)p.sign << 63;
3273 
3274     return r;
3275 }
3276 
3277 /*----------------------------------------------------------------------------
3278 | Returns a quiet NaN from a signalling NaN for the floating point value `a'.
3279 *----------------------------------------------------------------------------*/
3280 
3281 float16 float16_silence_nan(float16 a, float_status *status)
3282 {
3283     FloatParts p = float16_unpack_raw(a);
3284     p.frac <<= float16_params.frac_shift;
3285     p = parts_silence_nan(p, status);
3286     p.frac >>= float16_params.frac_shift;
3287     return float16_pack_raw(p);
3288 }
3289 
3290 float32 float32_silence_nan(float32 a, float_status *status)
3291 {
3292     FloatParts p = float32_unpack_raw(a);
3293     p.frac <<= float32_params.frac_shift;
3294     p = parts_silence_nan(p, status);
3295     p.frac >>= float32_params.frac_shift;
3296     return float32_pack_raw(p);
3297 }
3298 
3299 float64 float64_silence_nan(float64 a, float_status *status)
3300 {
3301     FloatParts p = float64_unpack_raw(a);
3302     p.frac <<= float64_params.frac_shift;
3303     p = parts_silence_nan(p, status);
3304     p.frac >>= float64_params.frac_shift;
3305     return float64_pack_raw(p);
3306 }
3307 
3308 
3309 /*----------------------------------------------------------------------------
3310 | If `a' is denormal and we are in flush-to-zero mode then set the
3311 | input-denormal exception and return zero. Otherwise just return the value.
3312 *----------------------------------------------------------------------------*/
3313 
3314 static bool parts_squash_denormal(FloatParts p, float_status *status)
3315 {
3316     if (p.exp == 0 && p.frac != 0) {
3317         float_raise(float_flag_input_denormal, status);
3318         return true;
3319     }
3320 
3321     return false;
3322 }
3323 
3324 float16 float16_squash_input_denormal(float16 a, float_status *status)
3325 {
3326     if (status->flush_inputs_to_zero) {
3327         FloatParts p = float16_unpack_raw(a);
3328         if (parts_squash_denormal(p, status)) {
3329             return float16_set_sign(float16_zero, p.sign);
3330         }
3331     }
3332     return a;
3333 }
3334 
3335 float32 float32_squash_input_denormal(float32 a, float_status *status)
3336 {
3337     if (status->flush_inputs_to_zero) {
3338         FloatParts p = float32_unpack_raw(a);
3339         if (parts_squash_denormal(p, status)) {
3340             return float32_set_sign(float32_zero, p.sign);
3341         }
3342     }
3343     return a;
3344 }
3345 
3346 float64 float64_squash_input_denormal(float64 a, float_status *status)
3347 {
3348     if (status->flush_inputs_to_zero) {
3349         FloatParts p = float64_unpack_raw(a);
3350         if (parts_squash_denormal(p, status)) {
3351             return float64_set_sign(float64_zero, p.sign);
3352         }
3353     }
3354     return a;
3355 }
3356 
3357 /*----------------------------------------------------------------------------
3358 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
3359 | and 7, and returns the properly rounded 32-bit integer corresponding to the
3360 | input.  If `zSign' is 1, the input is negated before being converted to an
3361 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
3362 | is simply rounded to an integer, with the inexact exception raised if the
3363 | input cannot be represented exactly as an integer.  However, if the fixed-
3364 | point input is too large, the invalid exception is raised and the largest
3365 | positive or negative integer is returned.
3366 *----------------------------------------------------------------------------*/
3367 
3368 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
3369 {
3370     int8_t roundingMode;
3371     flag roundNearestEven;
3372     int8_t roundIncrement, roundBits;
3373     int32_t z;
3374 
3375     roundingMode = status->float_rounding_mode;
3376     roundNearestEven = ( roundingMode == float_round_nearest_even );
3377     switch (roundingMode) {
3378     case float_round_nearest_even:
3379     case float_round_ties_away:
3380         roundIncrement = 0x40;
3381         break;
3382     case float_round_to_zero:
3383         roundIncrement = 0;
3384         break;
3385     case float_round_up:
3386         roundIncrement = zSign ? 0 : 0x7f;
3387         break;
3388     case float_round_down:
3389         roundIncrement = zSign ? 0x7f : 0;
3390         break;
3391     case float_round_to_odd:
3392         roundIncrement = absZ & 0x80 ? 0 : 0x7f;
3393         break;
3394     default:
3395         abort();
3396     }
3397     roundBits = absZ & 0x7F;
3398     absZ = ( absZ + roundIncrement )>>7;
3399     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
3400     z = absZ;
3401     if ( zSign ) z = - z;
3402     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
3403         float_raise(float_flag_invalid, status);
3404         return zSign ? INT32_MIN : INT32_MAX;
3405     }
3406     if (roundBits) {
3407         status->float_exception_flags |= float_flag_inexact;
3408     }
3409     return z;
3410 
3411 }
3412 
3413 /*----------------------------------------------------------------------------
3414 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3415 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3416 | and returns the properly rounded 64-bit integer corresponding to the input.
3417 | If `zSign' is 1, the input is negated before being converted to an integer.
3418 | Ordinarily, the fixed-point input is simply rounded to an integer, with
3419 | the inexact exception raised if the input cannot be represented exactly as
3420 | an integer.  However, if the fixed-point input is too large, the invalid
3421 | exception is raised and the largest positive or negative integer is
3422 | returned.
3423 *----------------------------------------------------------------------------*/
3424 
3425 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
3426                                float_status *status)
3427 {
3428     int8_t roundingMode;
3429     flag roundNearestEven, increment;
3430     int64_t z;
3431 
3432     roundingMode = status->float_rounding_mode;
3433     roundNearestEven = ( roundingMode == float_round_nearest_even );
3434     switch (roundingMode) {
3435     case float_round_nearest_even:
3436     case float_round_ties_away:
3437         increment = ((int64_t) absZ1 < 0);
3438         break;
3439     case float_round_to_zero:
3440         increment = 0;
3441         break;
3442     case float_round_up:
3443         increment = !zSign && absZ1;
3444         break;
3445     case float_round_down:
3446         increment = zSign && absZ1;
3447         break;
3448     case float_round_to_odd:
3449         increment = !(absZ0 & 1) && absZ1;
3450         break;
3451     default:
3452         abort();
3453     }
3454     if ( increment ) {
3455         ++absZ0;
3456         if ( absZ0 == 0 ) goto overflow;
3457         absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
3458     }
3459     z = absZ0;
3460     if ( zSign ) z = - z;
3461     if ( z && ( ( z < 0 ) ^ zSign ) ) {
3462  overflow:
3463         float_raise(float_flag_invalid, status);
3464         return zSign ? INT64_MIN : INT64_MAX;
3465     }
3466     if (absZ1) {
3467         status->float_exception_flags |= float_flag_inexact;
3468     }
3469     return z;
3470 
3471 }
3472 
3473 /*----------------------------------------------------------------------------
3474 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3475 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3476 | and returns the properly rounded 64-bit unsigned integer corresponding to the
3477 | input.  Ordinarily, the fixed-point input is simply rounded to an integer,
3478 | with the inexact exception raised if the input cannot be represented exactly
3479 | as an integer.  However, if the fixed-point input is too large, the invalid
3480 | exception is raised and the largest unsigned integer is returned.
3481 *----------------------------------------------------------------------------*/
3482 
3483 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
3484                                 uint64_t absZ1, float_status *status)
3485 {
3486     int8_t roundingMode;
3487     flag roundNearestEven, increment;
3488 
3489     roundingMode = status->float_rounding_mode;
3490     roundNearestEven = (roundingMode == float_round_nearest_even);
3491     switch (roundingMode) {
3492     case float_round_nearest_even:
3493     case float_round_ties_away:
3494         increment = ((int64_t)absZ1 < 0);
3495         break;
3496     case float_round_to_zero:
3497         increment = 0;
3498         break;
3499     case float_round_up:
3500         increment = !zSign && absZ1;
3501         break;
3502     case float_round_down:
3503         increment = zSign && absZ1;
3504         break;
3505     case float_round_to_odd:
3506         increment = !(absZ0 & 1) && absZ1;
3507         break;
3508     default:
3509         abort();
3510     }
3511     if (increment) {
3512         ++absZ0;
3513         if (absZ0 == 0) {
3514             float_raise(float_flag_invalid, status);
3515             return UINT64_MAX;
3516         }
3517         absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
3518     }
3519 
3520     if (zSign && absZ0) {
3521         float_raise(float_flag_invalid, status);
3522         return 0;
3523     }
3524 
3525     if (absZ1) {
3526         status->float_exception_flags |= float_flag_inexact;
3527     }
3528     return absZ0;
3529 }
3530 
3531 /*----------------------------------------------------------------------------
3532 | Normalizes the subnormal single-precision floating-point value represented
3533 | by the denormalized significand `aSig'.  The normalized exponent and
3534 | significand are stored at the locations pointed to by `zExpPtr' and
3535 | `zSigPtr', respectively.
3536 *----------------------------------------------------------------------------*/
3537 
static void
 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
{
    /* Left shift that brings the leading one of aSig up to bit 23. */
    const int8_t shift = clz32(aSig) - 8;

    *zSigPtr = aSig << shift;
    *zExpPtr = 1 - shift;
}
3548 
3549 /*----------------------------------------------------------------------------
3550 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3551 | and significand `zSig', and returns the proper single-precision floating-
3552 | point value corresponding to the abstract input.  Ordinarily, the abstract
3553 | value is simply rounded and packed into the single-precision format, with
3554 | the inexact exception raised if the abstract input cannot be represented
3555 | exactly.  However, if the abstract value is too large, the overflow and
3556 | inexact exceptions are raised and an infinity or maximal finite value is
3557 | returned.  If the abstract value is too small, the input value is rounded to
3558 | a subnormal number, and the underflow and inexact exceptions are raised if
3559 | the abstract input cannot be represented exactly as a subnormal single-
3560 | precision floating-point number.
3561 |     The input significand `zSig' has its binary point between bits 30
3562 | and 29, which is 7 bits to the left of the usual location.  This shifted
3563 | significand must be normalized or smaller.  If `zSig' is not normalized,
3564 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3565 | and it must not require rounding.  In the usual case that `zSig' is
3566 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3567 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3568 | Binary Floating-Point Arithmetic.
3569 *----------------------------------------------------------------------------*/
3570 
3571 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
3572                                    float_status *status)
3573 {
3574     int8_t roundingMode;
3575     flag roundNearestEven;
3576     int8_t roundIncrement, roundBits;
3577     flag isTiny;
3578 
3579     roundingMode = status->float_rounding_mode;
3580     roundNearestEven = ( roundingMode == float_round_nearest_even );
3581     switch (roundingMode) {
3582     case float_round_nearest_even:
3583     case float_round_ties_away:
3584         roundIncrement = 0x40;
3585         break;
3586     case float_round_to_zero:
3587         roundIncrement = 0;
3588         break;
3589     case float_round_up:
3590         roundIncrement = zSign ? 0 : 0x7f;
3591         break;
3592     case float_round_down:
3593         roundIncrement = zSign ? 0x7f : 0;
3594         break;
3595     case float_round_to_odd:
3596         roundIncrement = zSig & 0x80 ? 0 : 0x7f;
3597         break;
3598     default:
3599         abort();
3600         break;
3601     }
3602     roundBits = zSig & 0x7F;
3603     if ( 0xFD <= (uint16_t) zExp ) {
3604         if (    ( 0xFD < zExp )
3605              || (    ( zExp == 0xFD )
3606                   && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
3607            ) {
3608             bool overflow_to_inf = roundingMode != float_round_to_odd &&
3609                                    roundIncrement != 0;
3610             float_raise(float_flag_overflow | float_flag_inexact, status);
3611             return packFloat32(zSign, 0xFF, -!overflow_to_inf);
3612         }
3613         if ( zExp < 0 ) {
3614             if (status->flush_to_zero) {
3615                 float_raise(float_flag_output_denormal, status);
3616                 return packFloat32(zSign, 0, 0);
3617             }
3618             isTiny =
3619                 (status->float_detect_tininess
3620                  == float_tininess_before_rounding)
3621                 || ( zExp < -1 )
3622                 || ( zSig + roundIncrement < 0x80000000 );
3623             shift32RightJamming( zSig, - zExp, &zSig );
3624             zExp = 0;
3625             roundBits = zSig & 0x7F;
3626             if (isTiny && roundBits) {
3627                 float_raise(float_flag_underflow, status);
3628             }
3629             if (roundingMode == float_round_to_odd) {
3630                 /*
3631                  * For round-to-odd case, the roundIncrement depends on
3632                  * zSig which just changed.
3633                  */
3634                 roundIncrement = zSig & 0x80 ? 0 : 0x7f;
3635             }
3636         }
3637     }
3638     if (roundBits) {
3639         status->float_exception_flags |= float_flag_inexact;
3640     }
3641     zSig = ( zSig + roundIncrement )>>7;
3642     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
3643     if ( zSig == 0 ) zExp = 0;
3644     return packFloat32( zSign, zExp, zSig );
3645 
3646 }
3647 
3648 /*----------------------------------------------------------------------------
3649 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3650 | and significand `zSig', and returns the proper single-precision floating-
3651 | point value corresponding to the abstract input.  This routine is just like
3652 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
3653 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
3654 | floating-point exponent.
3655 *----------------------------------------------------------------------------*/
3656 
3657 static float32
3658  normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
3659                               float_status *status)
3660 {
3661     int8_t shiftCount;
3662 
3663     shiftCount = clz32(zSig) - 1;
3664     return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
3665                                status);
3666 
3667 }
3668 
3669 /*----------------------------------------------------------------------------
3670 | Normalizes the subnormal double-precision floating-point value represented
3671 | by the denormalized significand `aSig'.  The normalized exponent and
3672 | significand are stored at the locations pointed to by `zExpPtr' and
3673 | `zSigPtr', respectively.
3674 *----------------------------------------------------------------------------*/
3675 
static void
normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
{
    /*
     * Left-shift distance that moves the leading 1 of aSig up to bit 52
     * (11 leading zero bits remain in the 64-bit word).
     */
    const int8_t lshift = clz64(aSig) - 11;

    *zSigPtr = aSig << lshift;
    *zExpPtr = 1 - lshift;
}
3686 
3687 /*----------------------------------------------------------------------------
3688 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3689 | double-precision floating-point value, returning the result.  After being
3690 | shifted into the proper positions, the three fields are simply added
3691 | together to form the result.  This means that any integer portion of `zSig'
3692 | will be added into the exponent.  Since a properly normalized significand
3693 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3694 | than the desired result exponent whenever `zSig' is a complete, normalized
3695 | significand.
3696 *----------------------------------------------------------------------------*/
3697 
3698 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig)
3699 {
3700 
3701     return make_float64(
3702         ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
3703 
3704 }
3705 
3706 /*----------------------------------------------------------------------------
3707 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3708 | and significand `zSig', and returns the proper double-precision floating-
3709 | point value corresponding to the abstract input.  Ordinarily, the abstract
3710 | value is simply rounded and packed into the double-precision format, with
3711 | the inexact exception raised if the abstract input cannot be represented
3712 | exactly.  However, if the abstract value is too large, the overflow and
3713 | inexact exceptions are raised and an infinity or maximal finite value is
3714 | returned.  If the abstract value is too small, the input value is rounded to
3715 | a subnormal number, and the underflow and inexact exceptions are raised if
3716 | the abstract input cannot be represented exactly as a subnormal double-
3717 | precision floating-point number.
3718 |     The input significand `zSig' has its binary point between bits 62
3719 | and 61, which is 10 bits to the left of the usual location.  This shifted
3720 | significand must be normalized or smaller.  If `zSig' is not normalized,
3721 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3722 | and it must not require rounding.  In the usual case that `zSig' is
3723 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3724 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3725 | Binary Floating-Point Arithmetic.
3726 *----------------------------------------------------------------------------*/
3727 
3728 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
3729                                    float_status *status)
3730 {
3731     int8_t roundingMode;
3732     flag roundNearestEven;
3733     int roundIncrement, roundBits;
3734     flag isTiny;
3735 
3736     roundingMode = status->float_rounding_mode;
3737     roundNearestEven = ( roundingMode == float_round_nearest_even );
3738     switch (roundingMode) {
3739     case float_round_nearest_even:
3740     case float_round_ties_away:
3741         roundIncrement = 0x200;
3742         break;
3743     case float_round_to_zero:
3744         roundIncrement = 0;
3745         break;
3746     case float_round_up:
3747         roundIncrement = zSign ? 0 : 0x3ff;
3748         break;
3749     case float_round_down:
3750         roundIncrement = zSign ? 0x3ff : 0;
3751         break;
3752     case float_round_to_odd:
3753         roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
3754         break;
3755     default:
3756         abort();
3757     }
3758     roundBits = zSig & 0x3FF;
3759     if ( 0x7FD <= (uint16_t) zExp ) {
3760         if (    ( 0x7FD < zExp )
3761              || (    ( zExp == 0x7FD )
3762                   && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
3763            ) {
3764             bool overflow_to_inf = roundingMode != float_round_to_odd &&
3765                                    roundIncrement != 0;
3766             float_raise(float_flag_overflow | float_flag_inexact, status);
3767             return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
3768         }
3769         if ( zExp < 0 ) {
3770             if (status->flush_to_zero) {
3771                 float_raise(float_flag_output_denormal, status);
3772                 return packFloat64(zSign, 0, 0);
3773             }
3774             isTiny =
3775                    (status->float_detect_tininess
3776                     == float_tininess_before_rounding)
3777                 || ( zExp < -1 )
3778                 || ( zSig + roundIncrement < UINT64_C(0x8000000000000000) );
3779             shift64RightJamming( zSig, - zExp, &zSig );
3780             zExp = 0;
3781             roundBits = zSig & 0x3FF;
3782             if (isTiny && roundBits) {
3783                 float_raise(float_flag_underflow, status);
3784             }
3785             if (roundingMode == float_round_to_odd) {
3786                 /*
3787                  * For round-to-odd case, the roundIncrement depends on
3788                  * zSig which just changed.
3789                  */
3790                 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
3791             }
3792         }
3793     }
3794     if (roundBits) {
3795         status->float_exception_flags |= float_flag_inexact;
3796     }
3797     zSig = ( zSig + roundIncrement )>>10;
3798     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
3799     if ( zSig == 0 ) zExp = 0;
3800     return packFloat64( zSign, zExp, zSig );
3801 
3802 }
3803 
3804 /*----------------------------------------------------------------------------
3805 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3806 | and significand `zSig', and returns the proper double-precision floating-
3807 | point value corresponding to the abstract input.  This routine is just like
3808 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
3809 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
3810 | floating-point exponent.
3811 *----------------------------------------------------------------------------*/
3812 
3813 static float64
3814  normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
3815                               float_status *status)
3816 {
3817     int8_t shiftCount;
3818 
3819     shiftCount = clz64(zSig) - 1;
3820     return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
3821                                status);
3822 
3823 }
3824 
3825 /*----------------------------------------------------------------------------
3826 | Normalizes the subnormal extended double-precision floating-point value
3827 | represented by the denormalized significand `aSig'.  The normalized exponent
3828 | and significand are stored at the locations pointed to by `zExpPtr' and
3829 | `zSigPtr', respectively.
3830 *----------------------------------------------------------------------------*/
3831 
void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
                                uint64_t *zSigPtr)
{
    /* Shift the leading 1 of aSig all the way up to bit 63. */
    const int8_t lshift = clz64(aSig);

    *zSigPtr = aSig << lshift;
    *zExpPtr = 1 - lshift;
}
3841 
3842 /*----------------------------------------------------------------------------
3843 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3844 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
3845 | and returns the proper extended double-precision floating-point value
3846 | corresponding to the abstract input.  Ordinarily, the abstract value is
3847 | rounded and packed into the extended double-precision format, with the
3848 | inexact exception raised if the abstract input cannot be represented
3849 | exactly.  However, if the abstract value is too large, the overflow and
3850 | inexact exceptions are raised and an infinity or maximal finite value is
3851 | returned.  If the abstract value is too small, the input value is rounded to
3852 | a subnormal number, and the underflow and inexact exceptions are raised if
3853 | the abstract input cannot be represented exactly as a subnormal extended
3854 | double-precision floating-point number.
3855 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
3856 | number of bits as single or double precision, respectively.  Otherwise, the
3857 | result is rounded to the full precision of the extended double-precision
3858 | format.
3859 |     The input significand must be normalized or smaller.  If the input
3860 | significand is not normalized, `zExp' must be 0; in that case, the result
3861 | returned is a subnormal number, and it must not require rounding.  The
3862 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
3863 | Floating-Point Arithmetic.
3864 *----------------------------------------------------------------------------*/
3865 
floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
                              int32_t zExp, uint64_t zSig0, uint64_t zSig1,
                              float_status *status)
{
    /*
     * Two separate rounding paths live in this function:
     *   - roundingPrecision 32 or 64: round zSig0 itself to a reduced
     *     width using roundMask/roundIncrement;
     *   - anything else (including 80): round the full 64-bit zSig0 using
     *     zSig1 as the discarded fraction (label precision80).
     * Both paths share the `overflow' label further down.
     */
    int8_t roundingMode;
    flag roundNearestEven, increment, isTiny;
    int64_t roundIncrement, roundMask, roundBits;

    roundingMode = status->float_rounding_mode;
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    if ( roundingPrecision == 80 ) goto precision80;
    if ( roundingPrecision == 64 ) {
        /* Discard the low 11 bits of zSig0; half a unit is 0x400. */
        roundIncrement = UINT64_C(0x0000000000000400);
        roundMask = UINT64_C(0x00000000000007FF);
    }
    else if ( roundingPrecision == 32 ) {
        /* Discard the low 40 bits of zSig0; half a unit is 1 << 39. */
        roundIncrement = UINT64_C(0x0000008000000000);
        roundMask = UINT64_C(0x000000FFFFFFFFFF);
    }
    else {
        /* Unrecognised precision values are treated as full precision. */
        goto precision80;
    }
    /*
     * Fold any nonzero low word into bit 0 of zSig0 as a sticky bit;
     * zSig1 is not consulted again on the reduced-precision path.
     */
    zSig0 |= ( zSig1 != 0 );
    /*
     * Nearest modes keep the half-unit increment chosen above; directed
     * modes add either nothing or the whole mask depending on the sign.
     * Any other rounding mode aborts.
     */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : roundMask;
        break;
    case float_round_down:
        roundIncrement = zSign ? roundMask : 0;
        break;
    default:
        abort();
    }
    roundBits = zSig0 & roundMask;
    /*
     * Single unsigned test for both range ends: zExp <= 0 wraps to a huge
     * unsigned value, and zExp >= 0x7FFE compares directly.
     */
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        /* Overflow: exponent too large, or rounding carries out of 64 bits. */
        if (    ( 0x7FFE < zExp )
             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
           ) {
            goto overflow;
        }
        if ( zExp <= 0 ) {
            if (status->flush_to_zero) {
                float_raise(float_flag_output_denormal, status);
                return packFloatx80(zSign, 0, 0);
            }
            /*
             * Tiny when detection is before rounding, when zExp < 0, or
             * when adding the increment does not carry out of zSig0.
             */
            isTiny =
                   (status->float_detect_tininess
                    == float_tininess_before_rounding)
                || ( zExp < 0 )
                || ( zSig0 <= zSig0 + roundIncrement );
            /*
             * Denormalize.  NOTE(review): shift64RightJamming is defined
             * elsewhere; presumably it ORs shifted-out bits into the LSB.
             */
            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
            zExp = 0;
            roundBits = zSig0 & roundMask;
            if (isTiny && roundBits) {
                float_raise(float_flag_underflow, status);
            }
            if (roundBits) {
                status->float_exception_flags |= float_flag_inexact;
            }
            zSig0 += roundIncrement;
            /* Rounding set bit 63 again: the exponent field becomes 1. */
            if ( (int64_t) zSig0 < 0 ) zExp = 1;
            /*
             * On an exact tie under nearest-even, widen the mask by one
             * bit so the lowest kept bit is cleared as well.
             */
            roundIncrement = roundMask + 1;
            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
                roundMask |= roundIncrement;
            }
            zSig0 &= ~ roundMask;
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    if (roundBits) {
        status->float_exception_flags |= float_flag_inexact;
    }
    zSig0 += roundIncrement;
    /* The addition wrapped: carry into the exponent, significand = 1.0. */
    if ( zSig0 < roundIncrement ) {
        ++zExp;
        zSig0 = UINT64_C(0x8000000000000000);
    }
    /* Same tie-to-even mask widening as in the subnormal branch. */
    roundIncrement = roundMask + 1;
    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
        roundMask |= roundIncrement;
    }
    zSig0 &= ~ roundMask;
    if ( zSig0 == 0 ) zExp = 0;
    return packFloatx80( zSign, zExp, zSig0 );
 precision80:
    /*
     * Full-precision path: zSig1 holds the bits below the result LSB.
     * Nearest modes round up when its top bit is set; directed modes
     * round up on any nonzero zSig1 when the direction matches the sign.
     * Any other rounding mode aborts.
     */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t)zSig1 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && zSig1;
        break;
    case float_round_down:
        increment = zSign && zSig1;
        break;
    default:
        abort();
    }
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        if (    ( 0x7FFE < zExp )
             || (    ( zExp == 0x7FFE )
                  && ( zSig0 == UINT64_C(0xFFFFFFFFFFFFFFFF) )
                  && increment
                )
           ) {
            /* Full precision: the largest finite significand is all ones. */
            roundMask = 0;
            /*
             * Also reached by goto from the reduced-precision path, in
             * which case roundMask still holds that precision's mask.
             */
 overflow:
            float_raise(float_flag_overflow | float_flag_inexact, status);
            /*
             * Modes that round toward zero for this sign return the
             * largest finite value; all others return infinity.
             */
            if (    ( roundingMode == float_round_to_zero )
                 || ( zSign && ( roundingMode == float_round_up ) )
                 || ( ! zSign && ( roundingMode == float_round_down ) )
               ) {
                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
            }
            return packFloatx80(zSign,
                                floatx80_infinity_high,
                                floatx80_infinity_low);
        }
        if ( zExp <= 0 ) {
            /*
             * Unlike the reduced-precision path, status->flush_to_zero is
             * not consulted here.
             */
            isTiny =
                   (status->float_detect_tininess
                    == float_tininess_before_rounding)
                || ( zExp < 0 )
                || ! increment
                || ( zSig0 < UINT64_C(0xFFFFFFFFFFFFFFFF) );
            /*
             * Denormalize the 128-bit pair.  NOTE(review):
             * shift64ExtraRightJamming is defined elsewhere; presumably it
             * keeps shifted-out bits sticky in zSig1.
             */
            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
            zExp = 0;
            if (isTiny && zSig1) {
                float_raise(float_flag_underflow, status);
            }
            if (zSig1) {
                status->float_exception_flags |= float_flag_inexact;
            }
            /* zSig1 changed with the shift, so decide the increment again. */
            switch (roundingMode) {
            case float_round_nearest_even:
            case float_round_ties_away:
                increment = ((int64_t)zSig1 < 0);
                break;
            case float_round_to_zero:
                increment = 0;
                break;
            case float_round_up:
                increment = !zSign && zSig1;
                break;
            case float_round_down:
                increment = zSign && zSig1;
                break;
            default:
                abort();
            }
            if ( increment ) {
                ++zSig0;
                /* Exact tie under nearest-even: clear the LSB. */
                zSig0 &=
                    ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
                /* Rounding set bit 63: the exponent field becomes 1. */
                if ( (int64_t) zSig0 < 0 ) zExp = 1;
            }
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    if (zSig1) {
        status->float_exception_flags |= float_flag_inexact;
    }
    if ( increment ) {
        ++zSig0;
        if ( zSig0 == 0 ) {
            /* Carry out of 64 bits: bump exponent, significand = 1.0. */
            ++zExp;
            zSig0 = UINT64_C(0x8000000000000000);
        }
        else {
            /* Exact tie under nearest-even: clear the LSB. */
            zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
        }
    }
    else {
        if ( zSig0 == 0 ) zExp = 0;
    }
    return packFloatx80( zSign, zExp, zSig0 );

}
4054 
4055 /*----------------------------------------------------------------------------
4056 | Takes an abstract floating-point value having sign `zSign', exponent
4057 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
4058 | and returns the proper extended double-precision floating-point value
4059 | corresponding to the abstract input.  This routine is just like
4060 | `roundAndPackFloatx80' except that the input significand does not have to be
4061 | normalized.
4062 *----------------------------------------------------------------------------*/
4063 
4064 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
4065                                        flag zSign, int32_t zExp,
4066                                        uint64_t zSig0, uint64_t zSig1,
4067                                        float_status *status)
4068 {
4069     int8_t shiftCount;
4070 
4071     if ( zSig0 == 0 ) {
4072         zSig0 = zSig1;
4073         zSig1 = 0;
4074         zExp -= 64;
4075     }
4076     shiftCount = clz64(zSig0);
4077     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4078     zExp -= shiftCount;
4079     return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
4080                                 zSig0, zSig1, status);
4081 
4082 }
4083 
4084 /*----------------------------------------------------------------------------
4085 | Returns the least-significant 64 fraction bits of the quadruple-precision
4086 | floating-point value `a'.
4087 *----------------------------------------------------------------------------*/
4088 
4089 static inline uint64_t extractFloat128Frac1( float128 a )
4090 {
4091 
4092     return a.low;
4093 
4094 }
4095 
4096 /*----------------------------------------------------------------------------
4097 | Returns the most-significant 48 fraction bits of the quadruple-precision
4098 | floating-point value `a'.
4099 *----------------------------------------------------------------------------*/
4100 
4101 static inline uint64_t extractFloat128Frac0( float128 a )
4102 {
4103 
4104     return a.high & UINT64_C(0x0000FFFFFFFFFFFF);
4105 
4106 }
4107 
4108 /*----------------------------------------------------------------------------
4109 | Returns the exponent bits of the quadruple-precision floating-point value
4110 | `a'.
4111 *----------------------------------------------------------------------------*/
4112 
4113 static inline int32_t extractFloat128Exp( float128 a )
4114 {
4115 
4116     return ( a.high>>48 ) & 0x7FFF;
4117 
4118 }
4119 
4120 /*----------------------------------------------------------------------------
4121 | Returns the sign bit of the quadruple-precision floating-point value `a'.
4122 *----------------------------------------------------------------------------*/
4123 
4124 static inline flag extractFloat128Sign( float128 a )
4125 {
4126 
4127     return a.high>>63;
4128 
4129 }
4130 
4131 /*----------------------------------------------------------------------------
4132 | Normalizes the subnormal quadruple-precision floating-point value
4133 | represented by the denormalized significand formed by the concatenation of
4134 | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
4135 | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
4136 | significand are stored at the location pointed to by `zSig0Ptr', and the
4137 | least significant 64 bits of the normalized significand are stored at the
4138 | location pointed to by `zSig1Ptr'.
4139 *----------------------------------------------------------------------------*/
4140 
static void
normalizeFloat128Subnormal(
    uint64_t aSig0,
    uint64_t aSig1,
    int32_t *zExpPtr,
    uint64_t *zSig0Ptr,
    uint64_t *zSig1Ptr
)
{
    int8_t lshift;

    if (aSig0 != 0) {
        /* Leading 1 is in the high word: a short 128-bit shift suffices. */
        lshift = clz64(aSig0) - 15;
        shortShift128Left(aSig0, aSig1, lshift, zSig0Ptr, zSig1Ptr);
        *zExpPtr = 1 - lshift;
        return;
    }

    /*
     * High word is empty: the low word supplies every significant bit and
     * is conceptually moved up by 64 before the fine adjustment.
     */
    lshift = clz64(aSig1) - 15;
    if (lshift >= 0) {
        /* Everything fits in the high output word. */
        *zSig0Ptr = aSig1 << lshift;
        *zSig1Ptr = 0;
    } else {
        /* Leading 1 lies above bit 48: split across both output words. */
        *zSig0Ptr = aSig1 >> (-lshift);
        *zSig1Ptr = aSig1 << (lshift & 63);
    }
    *zExpPtr = -lshift - 63;
}
4171 
4172 /*----------------------------------------------------------------------------
4173 | Packs the sign `zSign', the exponent `zExp', and the significand formed
4174 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
4175 | floating-point value, returning the result.  After being shifted into the
4176 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
4177 | added together to form the most significant 64 bits of the result.  This
4178 | means that any integer portion of `zSig0' will be added into the exponent.
4179 | Since a properly normalized significand will have an integer portion equal
4180 | to 1, the `zExp' input should be 1 less than the desired result exponent
4181 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
4182 | significand.
4183 *----------------------------------------------------------------------------*/
4184 
4185 static inline float128
4186  packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
4187 {
4188     float128 z;
4189 
4190     z.low = zSig1;
4191     z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
4192     return z;
4193 
4194 }
4195 
4196 /*----------------------------------------------------------------------------
4197 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4198 | and extended significand formed by the concatenation of `zSig0', `zSig1',
4199 | and `zSig2', and returns the proper quadruple-precision floating-point value
4200 | corresponding to the abstract input.  Ordinarily, the abstract value is
4201 | simply rounded and packed into the quadruple-precision format, with the
4202 | inexact exception raised if the abstract input cannot be represented
4203 | exactly.  However, if the abstract value is too large, the overflow and
4204 | inexact exceptions are raised and an infinity or maximal finite value is
4205 | returned.  If the abstract value is too small, the input value is rounded to
4206 | a subnormal number, and the underflow and inexact exceptions are raised if
4207 | the abstract input cannot be represented exactly as a subnormal quadruple-
4208 | precision floating-point number.
4209 |     The input significand must be normalized or smaller.  If the input
4210 | significand is not normalized, `zExp' must be 0; in that case, the result
4211 | returned is a subnormal number, and it must not require rounding.  In the
4212 | usual case that the input significand is normalized, `zExp' must be 1 less
4213 | than the ``true'' floating-point exponent.  The handling of underflow and
4214 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4215 *----------------------------------------------------------------------------*/
4216 
4217 static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
4218                                      uint64_t zSig0, uint64_t zSig1,
4219                                      uint64_t zSig2, float_status *status)
4220 {
4221     int8_t roundingMode;
4222     flag roundNearestEven, increment, isTiny;
4223 
4224     roundingMode = status->float_rounding_mode;
4225     roundNearestEven = ( roundingMode == float_round_nearest_even );
4226     switch (roundingMode) {
4227     case float_round_nearest_even:
4228     case float_round_ties_away:
4229         increment = ((int64_t)zSig2 < 0);
4230         break;
4231     case float_round_to_zero:
4232         increment = 0;
4233         break;
4234     case float_round_up:
4235         increment = !zSign && zSig2;
4236         break;
4237     case float_round_down:
4238         increment = zSign && zSig2;
4239         break;
4240     case float_round_to_odd:
4241         increment = !(zSig1 & 0x1) && zSig2;
4242         break;
4243     default:
4244         abort();
4245     }
4246     if ( 0x7FFD <= (uint32_t) zExp ) {
4247         if (    ( 0x7FFD < zExp )
4248              || (    ( zExp == 0x7FFD )
4249                   && eq128(
4250                          UINT64_C(0x0001FFFFFFFFFFFF),
4251                          UINT64_C(0xFFFFFFFFFFFFFFFF),
4252                          zSig0,
4253                          zSig1
4254                      )
4255                   && increment
4256                 )
4257            ) {
4258             float_raise(float_flag_overflow | float_flag_inexact, status);
4259             if (    ( roundingMode == float_round_to_zero )
4260                  || ( zSign && ( roundingMode == float_round_up ) )
4261                  || ( ! zSign && ( roundingMode == float_round_down ) )
4262                  || (roundingMode == float_round_to_odd)
4263                ) {
4264                 return
4265                     packFloat128(
4266                         zSign,
4267                         0x7FFE,
4268                         UINT64_C(0x0000FFFFFFFFFFFF),
4269                         UINT64_C(0xFFFFFFFFFFFFFFFF)
4270                     );
4271             }
4272             return packFloat128( zSign, 0x7FFF, 0, 0 );
4273         }
4274         if ( zExp < 0 ) {
4275             if (status->flush_to_zero) {
4276                 float_raise(float_flag_output_denormal, status);
4277                 return packFloat128(zSign, 0, 0, 0);
4278             }
4279             isTiny =
4280                    (status->float_detect_tininess
4281                     == float_tininess_before_rounding)
4282                 || ( zExp < -1 )
4283                 || ! increment
4284                 || lt128(
4285                        zSig0,
4286                        zSig1,
4287                        UINT64_C(0x0001FFFFFFFFFFFF),
4288                        UINT64_C(0xFFFFFFFFFFFFFFFF)
4289                    );
4290             shift128ExtraRightJamming(
4291                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
4292             zExp = 0;
4293             if (isTiny && zSig2) {
4294                 float_raise(float_flag_underflow, status);
4295             }
4296             switch (roundingMode) {
4297             case float_round_nearest_even:
4298             case float_round_ties_away:
4299                 increment = ((int64_t)zSig2 < 0);
4300                 break;
4301             case float_round_to_zero:
4302                 increment = 0;
4303                 break;
4304             case float_round_up:
4305                 increment = !zSign && zSig2;
4306                 break;
4307             case float_round_down:
4308                 increment = zSign && zSig2;
4309                 break;
4310             case float_round_to_odd:
4311                 increment = !(zSig1 & 0x1) && zSig2;
4312                 break;
4313             default:
4314                 abort();
4315             }
4316         }
4317     }
4318     if (zSig2) {
4319         status->float_exception_flags |= float_flag_inexact;
4320     }
4321     if ( increment ) {
4322         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
4323         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
4324     }
4325     else {
4326         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
4327     }
4328     return packFloat128( zSign, zExp, zSig0, zSig1 );
4329 
4330 }
4331 
4332 /*----------------------------------------------------------------------------
4333 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4334 | and significand formed by the concatenation of `zSig0' and `zSig1', and
4335 | returns the proper quadruple-precision floating-point value corresponding
4336 | to the abstract input.  This routine is just like `roundAndPackFloat128'
4337 | except that the input significand has fewer bits and does not have to be
4338 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
4339 | point exponent.
4340 *----------------------------------------------------------------------------*/
4341 
4342 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
4343                                               uint64_t zSig0, uint64_t zSig1,
4344                                               float_status *status)
4345 {
4346     int8_t shiftCount;
4347     uint64_t zSig2;
4348 
4349     if ( zSig0 == 0 ) {
4350         zSig0 = zSig1;
4351         zSig1 = 0;
4352         zExp -= 64;
4353     }
4354     shiftCount = clz64(zSig0) - 15;
4355     if ( 0 <= shiftCount ) {
4356         zSig2 = 0;
4357         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4358     }
4359     else {
4360         shift128ExtraRightJamming(
4361             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
4362     }
4363     zExp -= shiftCount;
4364     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
4365 
4366 }
4367 
4368 
4369 /*----------------------------------------------------------------------------
4370 | Returns the result of converting the 32-bit two's complement integer `a'
4371 | to the extended double-precision floating-point format.  The conversion
4372 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4373 | Arithmetic.
4374 *----------------------------------------------------------------------------*/
4375 
4376 floatx80 int32_to_floatx80(int32_t a, float_status *status)
4377 {
4378     flag zSign;
4379     uint32_t absA;
4380     int8_t shiftCount;
4381     uint64_t zSig;
4382 
4383     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
4384     zSign = ( a < 0 );
4385     absA = zSign ? - a : a;
4386     shiftCount = clz32(absA) + 32;
4387     zSig = absA;
4388     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
4389 
4390 }
4391 
4392 /*----------------------------------------------------------------------------
4393 | Returns the result of converting the 32-bit two's complement integer `a' to
4394 | the quadruple-precision floating-point format.  The conversion is performed
4395 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4396 *----------------------------------------------------------------------------*/
4397 
4398 float128 int32_to_float128(int32_t a, float_status *status)
4399 {
4400     flag zSign;
4401     uint32_t absA;
4402     int8_t shiftCount;
4403     uint64_t zSig0;
4404 
4405     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
4406     zSign = ( a < 0 );
4407     absA = zSign ? - a : a;
4408     shiftCount = clz32(absA) + 17;
4409     zSig0 = absA;
4410     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
4411 
4412 }
4413 
4414 /*----------------------------------------------------------------------------
4415 | Returns the result of converting the 64-bit two's complement integer `a'
4416 | to the extended double-precision floating-point format.  The conversion
4417 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4418 | Arithmetic.
4419 *----------------------------------------------------------------------------*/
4420 
4421 floatx80 int64_to_floatx80(int64_t a, float_status *status)
4422 {
4423     flag zSign;
4424     uint64_t absA;
4425     int8_t shiftCount;
4426 
4427     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
4428     zSign = ( a < 0 );
4429     absA = zSign ? - a : a;
4430     shiftCount = clz64(absA);
4431     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
4432 
4433 }
4434 
4435 /*----------------------------------------------------------------------------
4436 | Returns the result of converting the 64-bit two's complement integer `a' to
4437 | the quadruple-precision floating-point format.  The conversion is performed
4438 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4439 *----------------------------------------------------------------------------*/
4440 
4441 float128 int64_to_float128(int64_t a, float_status *status)
4442 {
4443     flag zSign;
4444     uint64_t absA;
4445     int8_t shiftCount;
4446     int32_t zExp;
4447     uint64_t zSig0, zSig1;
4448 
4449     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
4450     zSign = ( a < 0 );
4451     absA = zSign ? - a : a;
4452     shiftCount = clz64(absA) + 49;
4453     zExp = 0x406E - shiftCount;
4454     if ( 64 <= shiftCount ) {
4455         zSig1 = 0;
4456         zSig0 = absA;
4457         shiftCount -= 64;
4458     }
4459     else {
4460         zSig1 = absA;
4461         zSig0 = 0;
4462     }
4463     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4464     return packFloat128( zSign, zExp, zSig0, zSig1 );
4465 
4466 }
4467 
4468 /*----------------------------------------------------------------------------
4469 | Returns the result of converting the 64-bit unsigned integer `a'
4470 | to the quadruple-precision floating-point format.  The conversion is performed
4471 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4472 *----------------------------------------------------------------------------*/
4473 
4474 float128 uint64_to_float128(uint64_t a, float_status *status)
4475 {
4476     if (a == 0) {
4477         return float128_zero;
4478     }
4479     return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status);
4480 }
4481 
4482 /*----------------------------------------------------------------------------
4483 | Returns the result of converting the single-precision floating-point value
4484 | `a' to the extended double-precision floating-point format.  The conversion
4485 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4486 | Arithmetic.
4487 *----------------------------------------------------------------------------*/
4488 
4489 floatx80 float32_to_floatx80(float32 a, float_status *status)
4490 {
4491     flag aSign;
4492     int aExp;
4493     uint32_t aSig;
4494 
4495     a = float32_squash_input_denormal(a, status);
4496     aSig = extractFloat32Frac( a );
4497     aExp = extractFloat32Exp( a );
4498     aSign = extractFloat32Sign( a );
4499     if ( aExp == 0xFF ) {
4500         if (aSig) {
4501             floatx80 res = commonNaNToFloatx80(float32ToCommonNaN(a, status),
4502                                                status);
4503             return floatx80_silence_nan(res, status);
4504         }
4505         return packFloatx80(aSign,
4506                             floatx80_infinity_high,
4507                             floatx80_infinity_low);
4508     }
4509     if ( aExp == 0 ) {
4510         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
4511         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4512     }
4513     aSig |= 0x00800000;
4514     return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
4515 
4516 }
4517 
4518 /*----------------------------------------------------------------------------
4519 | Returns the result of converting the single-precision floating-point value
4520 | `a' to the quadruple-precision floating-point format.  The conversion is
4521 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4522 | Arithmetic.
4523 *----------------------------------------------------------------------------*/
4524 
4525 float128 float32_to_float128(float32 a, float_status *status)
4526 {
4527     flag aSign;
4528     int aExp;
4529     uint32_t aSig;
4530 
4531     a = float32_squash_input_denormal(a, status);
4532     aSig = extractFloat32Frac( a );
4533     aExp = extractFloat32Exp( a );
4534     aSign = extractFloat32Sign( a );
4535     if ( aExp == 0xFF ) {
4536         if (aSig) {
4537             return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
4538         }
4539         return packFloat128( aSign, 0x7FFF, 0, 0 );
4540     }
4541     if ( aExp == 0 ) {
4542         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
4543         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4544         --aExp;
4545     }
4546     return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
4547 
4548 }
4549 
4550 /*----------------------------------------------------------------------------
4551 | Returns the remainder of the single-precision floating-point value `a'
4552 | with respect to the corresponding value `b'.  The operation is performed
4553 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4554 *----------------------------------------------------------------------------*/
4555 
4556 float32 float32_rem(float32 a, float32 b, float_status *status)
4557 {
4558     flag aSign, zSign;
4559     int aExp, bExp, expDiff;
4560     uint32_t aSig, bSig;
4561     uint32_t q;
4562     uint64_t aSig64, bSig64, q64;
4563     uint32_t alternateASig;
4564     int32_t sigMean;
4565     a = float32_squash_input_denormal(a, status);
4566     b = float32_squash_input_denormal(b, status);
4567 
4568     aSig = extractFloat32Frac( a );
4569     aExp = extractFloat32Exp( a );
4570     aSign = extractFloat32Sign( a );
4571     bSig = extractFloat32Frac( b );
4572     bExp = extractFloat32Exp( b );
4573     if ( aExp == 0xFF ) {
4574         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
4575             return propagateFloat32NaN(a, b, status);
4576         }
4577         float_raise(float_flag_invalid, status);
4578         return float32_default_nan(status);
4579     }
4580     if ( bExp == 0xFF ) {
4581         if (bSig) {
4582             return propagateFloat32NaN(a, b, status);
4583         }
4584         return a;
4585     }
4586     if ( bExp == 0 ) {
4587         if ( bSig == 0 ) {
4588             float_raise(float_flag_invalid, status);
4589             return float32_default_nan(status);
4590         }
4591         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
4592     }
4593     if ( aExp == 0 ) {
4594         if ( aSig == 0 ) return a;
4595         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4596     }
4597     expDiff = aExp - bExp;
4598     aSig |= 0x00800000;
4599     bSig |= 0x00800000;
4600     if ( expDiff < 32 ) {
4601         aSig <<= 8;
4602         bSig <<= 8;
4603         if ( expDiff < 0 ) {
4604             if ( expDiff < -1 ) return a;
4605             aSig >>= 1;
4606         }
4607         q = ( bSig <= aSig );
4608         if ( q ) aSig -= bSig;
4609         if ( 0 < expDiff ) {
4610             q = ( ( (uint64_t) aSig )<<32 ) / bSig;
4611             q >>= 32 - expDiff;
4612             bSig >>= 2;
4613             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4614         }
4615         else {
4616             aSig >>= 2;
4617             bSig >>= 2;
4618         }
4619     }
4620     else {
4621         if ( bSig <= aSig ) aSig -= bSig;
4622         aSig64 = ( (uint64_t) aSig )<<40;
4623         bSig64 = ( (uint64_t) bSig )<<40;
4624         expDiff -= 64;
4625         while ( 0 < expDiff ) {
4626             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
4627             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
4628             aSig64 = - ( ( bSig * q64 )<<38 );
4629             expDiff -= 62;
4630         }
4631         expDiff += 64;
4632         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
4633         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
4634         q = q64>>( 64 - expDiff );
4635         bSig <<= 6;
4636         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
4637     }
4638     do {
4639         alternateASig = aSig;
4640         ++q;
4641         aSig -= bSig;
4642     } while ( 0 <= (int32_t) aSig );
4643     sigMean = aSig + alternateASig;
4644     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4645         aSig = alternateASig;
4646     }
4647     zSign = ( (int32_t) aSig < 0 );
4648     if ( zSign ) aSig = - aSig;
4649     return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
4650 }
4651 
4652 
4653 
4654 /*----------------------------------------------------------------------------
4655 | Returns the binary exponential of the single-precision floating-point value
4656 | `a'. The operation is performed according to the IEC/IEEE Standard for
4657 | Binary Floating-Point Arithmetic.
4658 |
4659 | Uses the following identities:
4660 |
4661 | 1. -------------------------------------------------------------------------
4662 |      x    x*ln(2)
4663 |     2  = e
4664 |
4665 | 2. -------------------------------------------------------------------------
4666 |                      2     3     4     5           n
4667 |      x        x     x     x     x     x           x
4668 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
4669 |               1!    2!    3!    4!    5!          n!
4670 *----------------------------------------------------------------------------*/
4671 
4672 static const float64 float32_exp2_coefficients[15] =
4673 {
4674     const_float64( 0x3ff0000000000000ll ), /*  1 */
4675     const_float64( 0x3fe0000000000000ll ), /*  2 */
4676     const_float64( 0x3fc5555555555555ll ), /*  3 */
4677     const_float64( 0x3fa5555555555555ll ), /*  4 */
4678     const_float64( 0x3f81111111111111ll ), /*  5 */
4679     const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
4680     const_float64( 0x3f2a01a01a01a01all ), /*  7 */
4681     const_float64( 0x3efa01a01a01a01all ), /*  8 */
4682     const_float64( 0x3ec71de3a556c734ll ), /*  9 */
4683     const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
4684     const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
4685     const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
4686     const_float64( 0x3de6124613a86d09ll ), /* 13 */
4687     const_float64( 0x3da93974a8c07c9dll ), /* 14 */
4688     const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
4689 };
4690 
4691 float32 float32_exp2(float32 a, float_status *status)
4692 {
4693     flag aSign;
4694     int aExp;
4695     uint32_t aSig;
4696     float64 r, x, xn;
4697     int i;
4698     a = float32_squash_input_denormal(a, status);
4699 
4700     aSig = extractFloat32Frac( a );
4701     aExp = extractFloat32Exp( a );
4702     aSign = extractFloat32Sign( a );
4703 
4704     if ( aExp == 0xFF) {
4705         if (aSig) {
4706             return propagateFloat32NaN(a, float32_zero, status);
4707         }
4708         return (aSign) ? float32_zero : a;
4709     }
4710     if (aExp == 0) {
4711         if (aSig == 0) return float32_one;
4712     }
4713 
4714     float_raise(float_flag_inexact, status);
4715 
4716     /* ******************************* */
4717     /* using float64 for approximation */
4718     /* ******************************* */
4719     x = float32_to_float64(a, status);
4720     x = float64_mul(x, float64_ln2, status);
4721 
4722     xn = x;
4723     r = float64_one;
4724     for (i = 0 ; i < 15 ; i++) {
4725         float64 f;
4726 
4727         f = float64_mul(xn, float32_exp2_coefficients[i], status);
4728         r = float64_add(r, f, status);
4729 
4730         xn = float64_mul(xn, x, status);
4731     }
4732 
4733     return float64_to_float32(r, status);
4734 }
4735 
4736 /*----------------------------------------------------------------------------
4737 | Returns the binary log of the single-precision floating-point value `a'.
4738 | The operation is performed according to the IEC/IEEE Standard for Binary
4739 | Floating-Point Arithmetic.
4740 *----------------------------------------------------------------------------*/
4741 float32 float32_log2(float32 a, float_status *status)
4742 {
4743     flag aSign, zSign;
4744     int aExp;
4745     uint32_t aSig, zSig, i;
4746 
4747     a = float32_squash_input_denormal(a, status);
4748     aSig = extractFloat32Frac( a );
4749     aExp = extractFloat32Exp( a );
4750     aSign = extractFloat32Sign( a );
4751 
4752     if ( aExp == 0 ) {
4753         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
4754         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4755     }
4756     if ( aSign ) {
4757         float_raise(float_flag_invalid, status);
4758         return float32_default_nan(status);
4759     }
4760     if ( aExp == 0xFF ) {
4761         if (aSig) {
4762             return propagateFloat32NaN(a, float32_zero, status);
4763         }
4764         return a;
4765     }
4766 
4767     aExp -= 0x7F;
4768     aSig |= 0x00800000;
4769     zSign = aExp < 0;
4770     zSig = aExp << 23;
4771 
4772     for (i = 1 << 22; i > 0; i >>= 1) {
4773         aSig = ( (uint64_t)aSig * aSig ) >> 23;
4774         if ( aSig & 0x01000000 ) {
4775             aSig >>= 1;
4776             zSig |= i;
4777         }
4778     }
4779 
4780     if ( zSign )
4781         zSig = -zSig;
4782 
4783     return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
4784 }
4785 
4786 /*----------------------------------------------------------------------------
4787 | Returns 1 if the single-precision floating-point value `a' is equal to
4788 | the corresponding value `b', and 0 otherwise.  The invalid exception is
4789 | raised if either operand is a NaN.  Otherwise, the comparison is performed
4790 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4791 *----------------------------------------------------------------------------*/
4792 
4793 int float32_eq(float32 a, float32 b, float_status *status)
4794 {
4795     uint32_t av, bv;
4796     a = float32_squash_input_denormal(a, status);
4797     b = float32_squash_input_denormal(b, status);
4798 
4799     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4800          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4801        ) {
4802         float_raise(float_flag_invalid, status);
4803         return 0;
4804     }
4805     av = float32_val(a);
4806     bv = float32_val(b);
4807     return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
4808 }
4809 
4810 /*----------------------------------------------------------------------------
4811 | Returns 1 if the single-precision floating-point value `a' is less than
4812 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
4813 | exception is raised if either operand is a NaN.  The comparison is performed
4814 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4815 *----------------------------------------------------------------------------*/
4816 
4817 int float32_le(float32 a, float32 b, float_status *status)
4818 {
4819     flag aSign, bSign;
4820     uint32_t av, bv;
4821     a = float32_squash_input_denormal(a, status);
4822     b = float32_squash_input_denormal(b, status);
4823 
4824     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4825          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4826        ) {
4827         float_raise(float_flag_invalid, status);
4828         return 0;
4829     }
4830     aSign = extractFloat32Sign( a );
4831     bSign = extractFloat32Sign( b );
4832     av = float32_val(a);
4833     bv = float32_val(b);
4834     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
4835     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4836 
4837 }
4838 
4839 /*----------------------------------------------------------------------------
4840 | Returns 1 if the single-precision floating-point value `a' is less than
4841 | the corresponding value `b', and 0 otherwise.  The invalid exception is
4842 | raised if either operand is a NaN.  The comparison is performed according
4843 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4844 *----------------------------------------------------------------------------*/
4845 
4846 int float32_lt(float32 a, float32 b, float_status *status)
4847 {
4848     flag aSign, bSign;
4849     uint32_t av, bv;
4850     a = float32_squash_input_denormal(a, status);
4851     b = float32_squash_input_denormal(b, status);
4852 
4853     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4854          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4855        ) {
4856         float_raise(float_flag_invalid, status);
4857         return 0;
4858     }
4859     aSign = extractFloat32Sign( a );
4860     bSign = extractFloat32Sign( b );
4861     av = float32_val(a);
4862     bv = float32_val(b);
4863     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
4864     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4865 
4866 }
4867 
4868 /*----------------------------------------------------------------------------
4869 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
4870 | be compared, and 0 otherwise.  The invalid exception is raised if either
4871 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
4872 | Standard for Binary Floating-Point Arithmetic.
4873 *----------------------------------------------------------------------------*/
4874 
4875 int float32_unordered(float32 a, float32 b, float_status *status)
4876 {
4877     a = float32_squash_input_denormal(a, status);
4878     b = float32_squash_input_denormal(b, status);
4879 
4880     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4881          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4882        ) {
4883         float_raise(float_flag_invalid, status);
4884         return 1;
4885     }
4886     return 0;
4887 }
4888 
4889 /*----------------------------------------------------------------------------
4890 | Returns 1 if the single-precision floating-point value `a' is equal to
4891 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4892 | exception.  The comparison is performed according to the IEC/IEEE Standard
4893 | for Binary Floating-Point Arithmetic.
4894 *----------------------------------------------------------------------------*/
4895 
4896 int float32_eq_quiet(float32 a, float32 b, float_status *status)
4897 {
4898     a = float32_squash_input_denormal(a, status);
4899     b = float32_squash_input_denormal(b, status);
4900 
4901     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4902          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4903        ) {
4904         if (float32_is_signaling_nan(a, status)
4905          || float32_is_signaling_nan(b, status)) {
4906             float_raise(float_flag_invalid, status);
4907         }
4908         return 0;
4909     }
4910     return ( float32_val(a) == float32_val(b) ) ||
4911             ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
4912 }
4913 
4914 /*----------------------------------------------------------------------------
4915 | Returns 1 if the single-precision floating-point value `a' is less than or
4916 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4917 | cause an exception.  Otherwise, the comparison is performed according to the
4918 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4919 *----------------------------------------------------------------------------*/
4920 
4921 int float32_le_quiet(float32 a, float32 b, float_status *status)
4922 {
4923     flag aSign, bSign;
4924     uint32_t av, bv;
4925     a = float32_squash_input_denormal(a, status);
4926     b = float32_squash_input_denormal(b, status);
4927 
4928     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4929          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4930        ) {
4931         if (float32_is_signaling_nan(a, status)
4932          || float32_is_signaling_nan(b, status)) {
4933             float_raise(float_flag_invalid, status);
4934         }
4935         return 0;
4936     }
4937     aSign = extractFloat32Sign( a );
4938     bSign = extractFloat32Sign( b );
4939     av = float32_val(a);
4940     bv = float32_val(b);
4941     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
4942     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4943 
4944 }
4945 
4946 /*----------------------------------------------------------------------------
4947 | Returns 1 if the single-precision floating-point value `a' is less than
4948 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4949 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4950 | Standard for Binary Floating-Point Arithmetic.
4951 *----------------------------------------------------------------------------*/
4952 
4953 int float32_lt_quiet(float32 a, float32 b, float_status *status)
4954 {
4955     flag aSign, bSign;
4956     uint32_t av, bv;
4957     a = float32_squash_input_denormal(a, status);
4958     b = float32_squash_input_denormal(b, status);
4959 
4960     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4961          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4962        ) {
4963         if (float32_is_signaling_nan(a, status)
4964          || float32_is_signaling_nan(b, status)) {
4965             float_raise(float_flag_invalid, status);
4966         }
4967         return 0;
4968     }
4969     aSign = extractFloat32Sign( a );
4970     bSign = extractFloat32Sign( b );
4971     av = float32_val(a);
4972     bv = float32_val(b);
4973     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
4974     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4975 
4976 }
4977 
4978 /*----------------------------------------------------------------------------
4979 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
4980 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
4981 | comparison is performed according to the IEC/IEEE Standard for Binary
4982 | Floating-Point Arithmetic.
4983 *----------------------------------------------------------------------------*/
4984 
4985 int float32_unordered_quiet(float32 a, float32 b, float_status *status)
4986 {
4987     a = float32_squash_input_denormal(a, status);
4988     b = float32_squash_input_denormal(b, status);
4989 
4990     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4991          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4992        ) {
4993         if (float32_is_signaling_nan(a, status)
4994          || float32_is_signaling_nan(b, status)) {
4995             float_raise(float_flag_invalid, status);
4996         }
4997         return 1;
4998     }
4999     return 0;
5000 }
5001 
5002 /*----------------------------------------------------------------------------
5003 | Returns the result of converting the double-precision floating-point value
5004 | `a' to the extended double-precision floating-point format.  The conversion
5005 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5006 | Arithmetic.
5007 *----------------------------------------------------------------------------*/
5008 
5009 floatx80 float64_to_floatx80(float64 a, float_status *status)
5010 {
5011     flag aSign;
5012     int aExp;
5013     uint64_t aSig;
5014 
5015     a = float64_squash_input_denormal(a, status);
5016     aSig = extractFloat64Frac( a );
5017     aExp = extractFloat64Exp( a );
5018     aSign = extractFloat64Sign( a );
5019     if ( aExp == 0x7FF ) {
5020         if (aSig) {
5021             floatx80 res = commonNaNToFloatx80(float64ToCommonNaN(a, status),
5022                                                status);
5023             return floatx80_silence_nan(res, status);
5024         }
5025         return packFloatx80(aSign,
5026                             floatx80_infinity_high,
5027                             floatx80_infinity_low);
5028     }
5029     if ( aExp == 0 ) {
5030         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
5031         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5032     }
5033     return
5034         packFloatx80(
5035             aSign, aExp + 0x3C00, (aSig | UINT64_C(0x0010000000000000)) << 11);
5036 
5037 }
5038 
5039 /*----------------------------------------------------------------------------
5040 | Returns the result of converting the double-precision floating-point value
5041 | `a' to the quadruple-precision floating-point format.  The conversion is
5042 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5043 | Arithmetic.
5044 *----------------------------------------------------------------------------*/
5045 
5046 float128 float64_to_float128(float64 a, float_status *status)
5047 {
5048     flag aSign;
5049     int aExp;
5050     uint64_t aSig, zSig0, zSig1;
5051 
5052     a = float64_squash_input_denormal(a, status);
5053     aSig = extractFloat64Frac( a );
5054     aExp = extractFloat64Exp( a );
5055     aSign = extractFloat64Sign( a );
5056     if ( aExp == 0x7FF ) {
5057         if (aSig) {
5058             return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
5059         }
5060         return packFloat128( aSign, 0x7FFF, 0, 0 );
5061     }
5062     if ( aExp == 0 ) {
5063         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
5064         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5065         --aExp;
5066     }
5067     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
5068     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
5069 
5070 }
5071 
5072 
5073 /*----------------------------------------------------------------------------
5074 | Returns the remainder of the double-precision floating-point value `a'
5075 | with respect to the corresponding value `b'.  The operation is performed
5076 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5077 *----------------------------------------------------------------------------*/
5078 
5079 float64 float64_rem(float64 a, float64 b, float_status *status)
5080 {
5081     flag aSign, zSign;
5082     int aExp, bExp, expDiff;
5083     uint64_t aSig, bSig;
5084     uint64_t q, alternateASig;
5085     int64_t sigMean;
5086 
5087     a = float64_squash_input_denormal(a, status);
5088     b = float64_squash_input_denormal(b, status);
5089     aSig = extractFloat64Frac( a );
5090     aExp = extractFloat64Exp( a );
5091     aSign = extractFloat64Sign( a );
5092     bSig = extractFloat64Frac( b );
5093     bExp = extractFloat64Exp( b );
5094     if ( aExp == 0x7FF ) {
5095         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
5096             return propagateFloat64NaN(a, b, status);
5097         }
5098         float_raise(float_flag_invalid, status);
5099         return float64_default_nan(status);
5100     }
5101     if ( bExp == 0x7FF ) {
5102         if (bSig) {
5103             return propagateFloat64NaN(a, b, status);
5104         }
5105         return a;
5106     }
5107     if ( bExp == 0 ) {
5108         if ( bSig == 0 ) {
5109             float_raise(float_flag_invalid, status);
5110             return float64_default_nan(status);
5111         }
5112         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
5113     }
5114     if ( aExp == 0 ) {
5115         if ( aSig == 0 ) return a;
5116         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5117     }
5118     expDiff = aExp - bExp;
5119     aSig = (aSig | UINT64_C(0x0010000000000000)) << 11;
5120     bSig = (bSig | UINT64_C(0x0010000000000000)) << 11;
5121     if ( expDiff < 0 ) {
5122         if ( expDiff < -1 ) return a;
5123         aSig >>= 1;
5124     }
5125     q = ( bSig <= aSig );
5126     if ( q ) aSig -= bSig;
5127     expDiff -= 64;
5128     while ( 0 < expDiff ) {
5129         q = estimateDiv128To64( aSig, 0, bSig );
5130         q = ( 2 < q ) ? q - 2 : 0;
5131         aSig = - ( ( bSig>>2 ) * q );
5132         expDiff -= 62;
5133     }
5134     expDiff += 64;
5135     if ( 0 < expDiff ) {
5136         q = estimateDiv128To64( aSig, 0, bSig );
5137         q = ( 2 < q ) ? q - 2 : 0;
5138         q >>= 64 - expDiff;
5139         bSig >>= 2;
5140         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
5141     }
5142     else {
5143         aSig >>= 2;
5144         bSig >>= 2;
5145     }
5146     do {
5147         alternateASig = aSig;
5148         ++q;
5149         aSig -= bSig;
5150     } while ( 0 <= (int64_t) aSig );
5151     sigMean = aSig + alternateASig;
5152     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
5153         aSig = alternateASig;
5154     }
5155     zSign = ( (int64_t) aSig < 0 );
5156     if ( zSign ) aSig = - aSig;
5157     return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
5158 
5159 }
5160 
5161 /*----------------------------------------------------------------------------
5162 | Returns the binary log of the double-precision floating-point value `a'.
5163 | The operation is performed according to the IEC/IEEE Standard for Binary
5164 | Floating-Point Arithmetic.
5165 *----------------------------------------------------------------------------*/
5166 float64 float64_log2(float64 a, float_status *status)
5167 {
5168     flag aSign, zSign;
5169     int aExp;
5170     uint64_t aSig, aSig0, aSig1, zSig, i;
5171     a = float64_squash_input_denormal(a, status);
5172 
5173     aSig = extractFloat64Frac( a );
5174     aExp = extractFloat64Exp( a );
5175     aSign = extractFloat64Sign( a );
5176 
5177     if ( aExp == 0 ) {
5178         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
5179         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5180     }
5181     if ( aSign ) {
5182         float_raise(float_flag_invalid, status);
5183         return float64_default_nan(status);
5184     }
5185     if ( aExp == 0x7FF ) {
5186         if (aSig) {
5187             return propagateFloat64NaN(a, float64_zero, status);
5188         }
5189         return a;
5190     }
5191 
5192     aExp -= 0x3FF;
5193     aSig |= UINT64_C(0x0010000000000000);
5194     zSign = aExp < 0;
5195     zSig = (uint64_t)aExp << 52;
5196     for (i = 1LL << 51; i > 0; i >>= 1) {
5197         mul64To128( aSig, aSig, &aSig0, &aSig1 );
5198         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
5199         if ( aSig & UINT64_C(0x0020000000000000) ) {
5200             aSig >>= 1;
5201             zSig |= i;
5202         }
5203     }
5204 
5205     if ( zSign )
5206         zSig = -zSig;
5207     return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
5208 }
5209 
5210 /*----------------------------------------------------------------------------
5211 | Returns 1 if the double-precision floating-point value `a' is equal to the
5212 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
5213 | if either operand is a NaN.  Otherwise, the comparison is performed
5214 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5215 *----------------------------------------------------------------------------*/
5216 
5217 int float64_eq(float64 a, float64 b, float_status *status)
5218 {
5219     uint64_t av, bv;
5220     a = float64_squash_input_denormal(a, status);
5221     b = float64_squash_input_denormal(b, status);
5222 
5223     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5224          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5225        ) {
5226         float_raise(float_flag_invalid, status);
5227         return 0;
5228     }
5229     av = float64_val(a);
5230     bv = float64_val(b);
5231     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
5232 
5233 }
5234 
5235 /*----------------------------------------------------------------------------
5236 | Returns 1 if the double-precision floating-point value `a' is less than or
5237 | equal to the corresponding value `b', and 0 otherwise.  The invalid
5238 | exception is raised if either operand is a NaN.  The comparison is performed
5239 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5240 *----------------------------------------------------------------------------*/
5241 
5242 int float64_le(float64 a, float64 b, float_status *status)
5243 {
5244     flag aSign, bSign;
5245     uint64_t av, bv;
5246     a = float64_squash_input_denormal(a, status);
5247     b = float64_squash_input_denormal(b, status);
5248 
5249     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5250          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5251        ) {
5252         float_raise(float_flag_invalid, status);
5253         return 0;
5254     }
5255     aSign = extractFloat64Sign( a );
5256     bSign = extractFloat64Sign( b );
5257     av = float64_val(a);
5258     bv = float64_val(b);
5259     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
5260     return ( av == bv ) || ( aSign ^ ( av < bv ) );
5261 
5262 }
5263 
5264 /*----------------------------------------------------------------------------
5265 | Returns 1 if the double-precision floating-point value `a' is less than
5266 | the corresponding value `b', and 0 otherwise.  The invalid exception is
5267 | raised if either operand is a NaN.  The comparison is performed according
5268 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5269 *----------------------------------------------------------------------------*/
5270 
5271 int float64_lt(float64 a, float64 b, float_status *status)
5272 {
5273     flag aSign, bSign;
5274     uint64_t av, bv;
5275 
5276     a = float64_squash_input_denormal(a, status);
5277     b = float64_squash_input_denormal(b, status);
5278     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5279          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5280        ) {
5281         float_raise(float_flag_invalid, status);
5282         return 0;
5283     }
5284     aSign = extractFloat64Sign( a );
5285     bSign = extractFloat64Sign( b );
5286     av = float64_val(a);
5287     bv = float64_val(b);
5288     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
5289     return ( av != bv ) && ( aSign ^ ( av < bv ) );
5290 
5291 }
5292 
5293 /*----------------------------------------------------------------------------
5294 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
5295 | be compared, and 0 otherwise.  The invalid exception is raised if either
5296 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
5297 | Standard for Binary Floating-Point Arithmetic.
5298 *----------------------------------------------------------------------------*/
5299 
5300 int float64_unordered(float64 a, float64 b, float_status *status)
5301 {
5302     a = float64_squash_input_denormal(a, status);
5303     b = float64_squash_input_denormal(b, status);
5304 
5305     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5306          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5307        ) {
5308         float_raise(float_flag_invalid, status);
5309         return 1;
5310     }
5311     return 0;
5312 }
5313 
/*----------------------------------------------------------------------------
| Returns 1 if the double-precision floating-point value `a' is equal to the
| corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
| exception.  The comparison is performed according to the IEC/IEEE Standard
| for Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
5320 
5321 int float64_eq_quiet(float64 a, float64 b, float_status *status)
5322 {
5323     uint64_t av, bv;
5324     a = float64_squash_input_denormal(a, status);
5325     b = float64_squash_input_denormal(b, status);
5326 
5327     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5328          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5329        ) {
5330         if (float64_is_signaling_nan(a, status)
5331          || float64_is_signaling_nan(b, status)) {
5332             float_raise(float_flag_invalid, status);
5333         }
5334         return 0;
5335     }
5336     av = float64_val(a);
5337     bv = float64_val(b);
5338     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
5339 
5340 }
5341 
5342 /*----------------------------------------------------------------------------
5343 | Returns 1 if the double-precision floating-point value `a' is less than or
5344 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5345 | cause an exception.  Otherwise, the comparison is performed according to the
5346 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5347 *----------------------------------------------------------------------------*/
5348 
5349 int float64_le_quiet(float64 a, float64 b, float_status *status)
5350 {
5351     flag aSign, bSign;
5352     uint64_t av, bv;
5353     a = float64_squash_input_denormal(a, status);
5354     b = float64_squash_input_denormal(b, status);
5355 
5356     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5357          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5358        ) {
5359         if (float64_is_signaling_nan(a, status)
5360          || float64_is_signaling_nan(b, status)) {
5361             float_raise(float_flag_invalid, status);
5362         }
5363         return 0;
5364     }
5365     aSign = extractFloat64Sign( a );
5366     bSign = extractFloat64Sign( b );
5367     av = float64_val(a);
5368     bv = float64_val(b);
5369     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
5370     return ( av == bv ) || ( aSign ^ ( av < bv ) );
5371 
5372 }
5373 
5374 /*----------------------------------------------------------------------------
5375 | Returns 1 if the double-precision floating-point value `a' is less than
5376 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5377 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
5378 | Standard for Binary Floating-Point Arithmetic.
5379 *----------------------------------------------------------------------------*/
5380 
5381 int float64_lt_quiet(float64 a, float64 b, float_status *status)
5382 {
5383     flag aSign, bSign;
5384     uint64_t av, bv;
5385     a = float64_squash_input_denormal(a, status);
5386     b = float64_squash_input_denormal(b, status);
5387 
5388     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5389          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5390        ) {
5391         if (float64_is_signaling_nan(a, status)
5392          || float64_is_signaling_nan(b, status)) {
5393             float_raise(float_flag_invalid, status);
5394         }
5395         return 0;
5396     }
5397     aSign = extractFloat64Sign( a );
5398     bSign = extractFloat64Sign( b );
5399     av = float64_val(a);
5400     bv = float64_val(b);
5401     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
5402     return ( av != bv ) && ( aSign ^ ( av < bv ) );
5403 
5404 }
5405 
5406 /*----------------------------------------------------------------------------
5407 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
5408 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
5409 | comparison is performed according to the IEC/IEEE Standard for Binary
5410 | Floating-Point Arithmetic.
5411 *----------------------------------------------------------------------------*/
5412 
5413 int float64_unordered_quiet(float64 a, float64 b, float_status *status)
5414 {
5415     a = float64_squash_input_denormal(a, status);
5416     b = float64_squash_input_denormal(b, status);
5417 
5418     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5419          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5420        ) {
5421         if (float64_is_signaling_nan(a, status)
5422          || float64_is_signaling_nan(b, status)) {
5423             float_raise(float_flag_invalid, status);
5424         }
5425         return 1;
5426     }
5427     return 0;
5428 }
5429 
5430 /*----------------------------------------------------------------------------
5431 | Returns the result of converting the extended double-precision floating-
5432 | point value `a' to the 32-bit two's complement integer format.  The
5433 | conversion is performed according to the IEC/IEEE Standard for Binary
5434 | Floating-Point Arithmetic---which means in particular that the conversion
5435 | is rounded according to the current rounding mode.  If `a' is a NaN, the
5436 | largest positive integer is returned.  Otherwise, if the conversion
5437 | overflows, the largest integer with the same sign as `a' is returned.
5438 *----------------------------------------------------------------------------*/
5439 
5440 int32_t floatx80_to_int32(floatx80 a, float_status *status)
5441 {
5442     flag aSign;
5443     int32_t aExp, shiftCount;
5444     uint64_t aSig;
5445 
5446     if (floatx80_invalid_encoding(a)) {
5447         float_raise(float_flag_invalid, status);
5448         return 1 << 31;
5449     }
5450     aSig = extractFloatx80Frac( a );
5451     aExp = extractFloatx80Exp( a );
5452     aSign = extractFloatx80Sign( a );
5453     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
5454     shiftCount = 0x4037 - aExp;
5455     if ( shiftCount <= 0 ) shiftCount = 1;
5456     shift64RightJamming( aSig, shiftCount, &aSig );
5457     return roundAndPackInt32(aSign, aSig, status);
5458 
5459 }
5460 
5461 /*----------------------------------------------------------------------------
5462 | Returns the result of converting the extended double-precision floating-
5463 | point value `a' to the 32-bit two's complement integer format.  The
5464 | conversion is performed according to the IEC/IEEE Standard for Binary
5465 | Floating-Point Arithmetic, except that the conversion is always rounded
5466 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
5467 | Otherwise, if the conversion overflows, the largest integer with the same
5468 | sign as `a' is returned.
5469 *----------------------------------------------------------------------------*/
5470 
5471 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
5472 {
5473     flag aSign;
5474     int32_t aExp, shiftCount;
5475     uint64_t aSig, savedASig;
5476     int32_t z;
5477 
5478     if (floatx80_invalid_encoding(a)) {
5479         float_raise(float_flag_invalid, status);
5480         return 1 << 31;
5481     }
5482     aSig = extractFloatx80Frac( a );
5483     aExp = extractFloatx80Exp( a );
5484     aSign = extractFloatx80Sign( a );
5485     if ( 0x401E < aExp ) {
5486         if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
5487         goto invalid;
5488     }
5489     else if ( aExp < 0x3FFF ) {
5490         if (aExp || aSig) {
5491             status->float_exception_flags |= float_flag_inexact;
5492         }
5493         return 0;
5494     }
5495     shiftCount = 0x403E - aExp;
5496     savedASig = aSig;
5497     aSig >>= shiftCount;
5498     z = aSig;
5499     if ( aSign ) z = - z;
5500     if ( ( z < 0 ) ^ aSign ) {
5501  invalid:
5502         float_raise(float_flag_invalid, status);
5503         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5504     }
5505     if ( ( aSig<<shiftCount ) != savedASig ) {
5506         status->float_exception_flags |= float_flag_inexact;
5507     }
5508     return z;
5509 
5510 }
5511 
5512 /*----------------------------------------------------------------------------
5513 | Returns the result of converting the extended double-precision floating-
5514 | point value `a' to the 64-bit two's complement integer format.  The
5515 | conversion is performed according to the IEC/IEEE Standard for Binary
5516 | Floating-Point Arithmetic---which means in particular that the conversion
5517 | is rounded according to the current rounding mode.  If `a' is a NaN,
5518 | the largest positive integer is returned.  Otherwise, if the conversion
5519 | overflows, the largest integer with the same sign as `a' is returned.
5520 *----------------------------------------------------------------------------*/
5521 
5522 int64_t floatx80_to_int64(floatx80 a, float_status *status)
5523 {
5524     flag aSign;
5525     int32_t aExp, shiftCount;
5526     uint64_t aSig, aSigExtra;
5527 
5528     if (floatx80_invalid_encoding(a)) {
5529         float_raise(float_flag_invalid, status);
5530         return 1ULL << 63;
5531     }
5532     aSig = extractFloatx80Frac( a );
5533     aExp = extractFloatx80Exp( a );
5534     aSign = extractFloatx80Sign( a );
5535     shiftCount = 0x403E - aExp;
5536     if ( shiftCount <= 0 ) {
5537         if ( shiftCount ) {
5538             float_raise(float_flag_invalid, status);
5539             if (!aSign || floatx80_is_any_nan(a)) {
5540                 return INT64_MAX;
5541             }
5542             return INT64_MIN;
5543         }
5544         aSigExtra = 0;
5545     }
5546     else {
5547         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
5548     }
5549     return roundAndPackInt64(aSign, aSig, aSigExtra, status);
5550 
5551 }
5552 
5553 /*----------------------------------------------------------------------------
5554 | Returns the result of converting the extended double-precision floating-
5555 | point value `a' to the 64-bit two's complement integer format.  The
5556 | conversion is performed according to the IEC/IEEE Standard for Binary
5557 | Floating-Point Arithmetic, except that the conversion is always rounded
5558 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
5559 | Otherwise, if the conversion overflows, the largest integer with the same
5560 | sign as `a' is returned.
5561 *----------------------------------------------------------------------------*/
5562 
5563 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
5564 {
5565     flag aSign;
5566     int32_t aExp, shiftCount;
5567     uint64_t aSig;
5568     int64_t z;
5569 
5570     if (floatx80_invalid_encoding(a)) {
5571         float_raise(float_flag_invalid, status);
5572         return 1ULL << 63;
5573     }
5574     aSig = extractFloatx80Frac( a );
5575     aExp = extractFloatx80Exp( a );
5576     aSign = extractFloatx80Sign( a );
5577     shiftCount = aExp - 0x403E;
5578     if ( 0 <= shiftCount ) {
5579         aSig &= UINT64_C(0x7FFFFFFFFFFFFFFF);
5580         if ( ( a.high != 0xC03E ) || aSig ) {
5581             float_raise(float_flag_invalid, status);
5582             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
5583                 return INT64_MAX;
5584             }
5585         }
5586         return INT64_MIN;
5587     }
5588     else if ( aExp < 0x3FFF ) {
5589         if (aExp | aSig) {
5590             status->float_exception_flags |= float_flag_inexact;
5591         }
5592         return 0;
5593     }
5594     z = aSig>>( - shiftCount );
5595     if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
5596         status->float_exception_flags |= float_flag_inexact;
5597     }
5598     if ( aSign ) z = - z;
5599     return z;
5600 
5601 }
5602 
5603 /*----------------------------------------------------------------------------
5604 | Returns the result of converting the extended double-precision floating-
5605 | point value `a' to the single-precision floating-point format.  The
5606 | conversion is performed according to the IEC/IEEE Standard for Binary
5607 | Floating-Point Arithmetic.
5608 *----------------------------------------------------------------------------*/
5609 
5610 float32 floatx80_to_float32(floatx80 a, float_status *status)
5611 {
5612     flag aSign;
5613     int32_t aExp;
5614     uint64_t aSig;
5615 
5616     if (floatx80_invalid_encoding(a)) {
5617         float_raise(float_flag_invalid, status);
5618         return float32_default_nan(status);
5619     }
5620     aSig = extractFloatx80Frac( a );
5621     aExp = extractFloatx80Exp( a );
5622     aSign = extractFloatx80Sign( a );
5623     if ( aExp == 0x7FFF ) {
5624         if ( (uint64_t) ( aSig<<1 ) ) {
5625             float32 res = commonNaNToFloat32(floatx80ToCommonNaN(a, status),
5626                                              status);
5627             return float32_silence_nan(res, status);
5628         }
5629         return packFloat32( aSign, 0xFF, 0 );
5630     }
5631     shift64RightJamming( aSig, 33, &aSig );
5632     if ( aExp || aSig ) aExp -= 0x3F81;
5633     return roundAndPackFloat32(aSign, aExp, aSig, status);
5634 
5635 }
5636 
5637 /*----------------------------------------------------------------------------
5638 | Returns the result of converting the extended double-precision floating-
5639 | point value `a' to the double-precision floating-point format.  The
5640 | conversion is performed according to the IEC/IEEE Standard for Binary
5641 | Floating-Point Arithmetic.
5642 *----------------------------------------------------------------------------*/
5643 
5644 float64 floatx80_to_float64(floatx80 a, float_status *status)
5645 {
5646     flag aSign;
5647     int32_t aExp;
5648     uint64_t aSig, zSig;
5649 
5650     if (floatx80_invalid_encoding(a)) {
5651         float_raise(float_flag_invalid, status);
5652         return float64_default_nan(status);
5653     }
5654     aSig = extractFloatx80Frac( a );
5655     aExp = extractFloatx80Exp( a );
5656     aSign = extractFloatx80Sign( a );
5657     if ( aExp == 0x7FFF ) {
5658         if ( (uint64_t) ( aSig<<1 ) ) {
5659             float64 res = commonNaNToFloat64(floatx80ToCommonNaN(a, status),
5660                                              status);
5661             return float64_silence_nan(res, status);
5662         }
5663         return packFloat64( aSign, 0x7FF, 0 );
5664     }
5665     shift64RightJamming( aSig, 1, &zSig );
5666     if ( aExp || aSig ) aExp -= 0x3C01;
5667     return roundAndPackFloat64(aSign, aExp, zSig, status);
5668 
5669 }
5670 
5671 /*----------------------------------------------------------------------------
5672 | Returns the result of converting the extended double-precision floating-
5673 | point value `a' to the quadruple-precision floating-point format.  The
5674 | conversion is performed according to the IEC/IEEE Standard for Binary
5675 | Floating-Point Arithmetic.
5676 *----------------------------------------------------------------------------*/
5677 
5678 float128 floatx80_to_float128(floatx80 a, float_status *status)
5679 {
5680     flag aSign;
5681     int aExp;
5682     uint64_t aSig, zSig0, zSig1;
5683 
5684     if (floatx80_invalid_encoding(a)) {
5685         float_raise(float_flag_invalid, status);
5686         return float128_default_nan(status);
5687     }
5688     aSig = extractFloatx80Frac( a );
5689     aExp = extractFloatx80Exp( a );
5690     aSign = extractFloatx80Sign( a );
5691     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
5692         float128 res = commonNaNToFloat128(floatx80ToCommonNaN(a, status),
5693                                            status);
5694         return float128_silence_nan(res, status);
5695     }
5696     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
5697     return packFloat128( aSign, aExp, zSig0, zSig1 );
5698 
5699 }
5700 
5701 /*----------------------------------------------------------------------------
5702 | Rounds the extended double-precision floating-point value `a'
5703 | to the precision provided by floatx80_rounding_precision and returns the
5704 | result as an extended double-precision floating-point value.
5705 | The operation is performed according to the IEC/IEEE Standard for Binary
5706 | Floating-Point Arithmetic.
5707 *----------------------------------------------------------------------------*/
5708 
5709 floatx80 floatx80_round(floatx80 a, float_status *status)
5710 {
5711     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5712                                 extractFloatx80Sign(a),
5713                                 extractFloatx80Exp(a),
5714                                 extractFloatx80Frac(a), 0, status);
5715 }
5716 
/*----------------------------------------------------------------------------
| Rounds the extended double-precision floating-point value `a' to an integer,
| and returns the result as an extended double-precision floating-point
| value.  The operation is performed according to the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
*----------------------------------------------------------------------------*/
5723 
5724 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
5725 {
5726     flag aSign;
5727     int32_t aExp;
5728     uint64_t lastBitMask, roundBitsMask;
5729     floatx80 z;
5730 
5731     if (floatx80_invalid_encoding(a)) {
5732         float_raise(float_flag_invalid, status);
5733         return floatx80_default_nan(status);
5734     }
5735     aExp = extractFloatx80Exp( a );
5736     if ( 0x403E <= aExp ) {
5737         if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
5738             return propagateFloatx80NaN(a, a, status);
5739         }
5740         return a;
5741     }
5742     if ( aExp < 0x3FFF ) {
5743         if (    ( aExp == 0 )
5744              && ( (uint64_t) ( extractFloatx80Frac( a ) ) == 0 ) ) {
5745             return a;
5746         }
5747         status->float_exception_flags |= float_flag_inexact;
5748         aSign = extractFloatx80Sign( a );
5749         switch (status->float_rounding_mode) {
5750          case float_round_nearest_even:
5751             if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
5752                ) {
5753                 return
5754                     packFloatx80( aSign, 0x3FFF, UINT64_C(0x8000000000000000));
5755             }
5756             break;
5757         case float_round_ties_away:
5758             if (aExp == 0x3FFE) {
5759                 return packFloatx80(aSign, 0x3FFF, UINT64_C(0x8000000000000000));
5760             }
5761             break;
5762          case float_round_down:
5763             return
5764                   aSign ?
5765                       packFloatx80( 1, 0x3FFF, UINT64_C(0x8000000000000000))
5766                 : packFloatx80( 0, 0, 0 );
5767          case float_round_up:
5768             return
5769                   aSign ? packFloatx80( 1, 0, 0 )
5770                 : packFloatx80( 0, 0x3FFF, UINT64_C(0x8000000000000000));
5771         }
5772         return packFloatx80( aSign, 0, 0 );
5773     }
5774     lastBitMask = 1;
5775     lastBitMask <<= 0x403E - aExp;
5776     roundBitsMask = lastBitMask - 1;
5777     z = a;
5778     switch (status->float_rounding_mode) {
5779     case float_round_nearest_even:
5780         z.low += lastBitMask>>1;
5781         if ((z.low & roundBitsMask) == 0) {
5782             z.low &= ~lastBitMask;
5783         }
5784         break;
5785     case float_round_ties_away:
5786         z.low += lastBitMask >> 1;
5787         break;
5788     case float_round_to_zero:
5789         break;
5790     case float_round_up:
5791         if (!extractFloatx80Sign(z)) {
5792             z.low += roundBitsMask;
5793         }
5794         break;
5795     case float_round_down:
5796         if (extractFloatx80Sign(z)) {
5797             z.low += roundBitsMask;
5798         }
5799         break;
5800     default:
5801         abort();
5802     }
5803     z.low &= ~ roundBitsMask;
5804     if ( z.low == 0 ) {
5805         ++z.high;
5806         z.low = UINT64_C(0x8000000000000000);
5807     }
5808     if (z.low != a.low) {
5809         status->float_exception_flags |= float_flag_inexact;
5810     }
5811     return z;
5812 
5813 }
5814 
5815 /*----------------------------------------------------------------------------
5816 | Returns the result of adding the absolute values of the extended double-
5817 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
5818 | negated before being returned.  `zSign' is ignored if the result is a NaN.
5819 | The addition is performed according to the IEC/IEEE Standard for Binary
5820 | Floating-Point Arithmetic.
5821 *----------------------------------------------------------------------------*/
5822 
5823 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5824                                 float_status *status)
5825 {
5826     int32_t aExp, bExp, zExp;
5827     uint64_t aSig, bSig, zSig0, zSig1;
5828     int32_t expDiff;
5829 
5830     aSig = extractFloatx80Frac( a );
5831     aExp = extractFloatx80Exp( a );
5832     bSig = extractFloatx80Frac( b );
5833     bExp = extractFloatx80Exp( b );
5834     expDiff = aExp - bExp;
5835     if ( 0 < expDiff ) {
5836         if ( aExp == 0x7FFF ) {
5837             if ((uint64_t)(aSig << 1)) {
5838                 return propagateFloatx80NaN(a, b, status);
5839             }
5840             return a;
5841         }
5842         if ( bExp == 0 ) --expDiff;
5843         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5844         zExp = aExp;
5845     }
5846     else if ( expDiff < 0 ) {
5847         if ( bExp == 0x7FFF ) {
5848             if ((uint64_t)(bSig << 1)) {
5849                 return propagateFloatx80NaN(a, b, status);
5850             }
5851             return packFloatx80(zSign,
5852                                 floatx80_infinity_high,
5853                                 floatx80_infinity_low);
5854         }
5855         if ( aExp == 0 ) ++expDiff;
5856         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5857         zExp = bExp;
5858     }
5859     else {
5860         if ( aExp == 0x7FFF ) {
5861             if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5862                 return propagateFloatx80NaN(a, b, status);
5863             }
5864             return a;
5865         }
5866         zSig1 = 0;
5867         zSig0 = aSig + bSig;
5868         if ( aExp == 0 ) {
5869             if ((aSig | bSig) & UINT64_C(0x8000000000000000) && zSig0 < aSig) {
5870                 /* At least one of the values is a pseudo-denormal,
5871                  * and there is a carry out of the result.  */
5872                 zExp = 1;
5873                 goto shiftRight1;
5874             }
5875             if (zSig0 == 0) {
5876                 return packFloatx80(zSign, 0, 0);
5877             }
5878             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
5879             goto roundAndPack;
5880         }
5881         zExp = aExp;
5882         goto shiftRight1;
5883     }
5884     zSig0 = aSig + bSig;
5885     if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
5886  shiftRight1:
5887     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
5888     zSig0 |= UINT64_C(0x8000000000000000);
5889     ++zExp;
5890  roundAndPack:
5891     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5892                                 zSign, zExp, zSig0, zSig1, status);
5893 }
5894 
5895 /*----------------------------------------------------------------------------
5896 | Returns the result of subtracting the absolute values of the extended
5897 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
5898 | difference is negated before being returned.  `zSign' is ignored if the
5899 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
5900 | Standard for Binary Floating-Point Arithmetic.
5901 *----------------------------------------------------------------------------*/
5902 
5903 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5904                                 float_status *status)
5905 {
5906     int32_t aExp, bExp, zExp;
5907     uint64_t aSig, bSig, zSig0, zSig1;
5908     int32_t expDiff;
5909 
5910     aSig = extractFloatx80Frac( a );
5911     aExp = extractFloatx80Exp( a );
5912     bSig = extractFloatx80Frac( b );
5913     bExp = extractFloatx80Exp( b );
5914     expDiff = aExp - bExp;
5915     if ( 0 < expDiff ) goto aExpBigger;
5916     if ( expDiff < 0 ) goto bExpBigger;
5917     if ( aExp == 0x7FFF ) {
5918         if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5919             return propagateFloatx80NaN(a, b, status);
5920         }
5921         float_raise(float_flag_invalid, status);
5922         return floatx80_default_nan(status);
5923     }
5924     if ( aExp == 0 ) {
5925         aExp = 1;
5926         bExp = 1;
5927     }
5928     zSig1 = 0;
5929     if ( bSig < aSig ) goto aBigger;
5930     if ( aSig < bSig ) goto bBigger;
5931     return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
5932  bExpBigger:
5933     if ( bExp == 0x7FFF ) {
5934         if ((uint64_t)(bSig << 1)) {
5935             return propagateFloatx80NaN(a, b, status);
5936         }
5937         return packFloatx80(zSign ^ 1, floatx80_infinity_high,
5938                             floatx80_infinity_low);
5939     }
5940     if ( aExp == 0 ) ++expDiff;
5941     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5942  bBigger:
5943     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
5944     zExp = bExp;
5945     zSign ^= 1;
5946     goto normalizeRoundAndPack;
5947  aExpBigger:
5948     if ( aExp == 0x7FFF ) {
5949         if ((uint64_t)(aSig << 1)) {
5950             return propagateFloatx80NaN(a, b, status);
5951         }
5952         return a;
5953     }
5954     if ( bExp == 0 ) --expDiff;
5955     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5956  aBigger:
5957     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
5958     zExp = aExp;
5959  normalizeRoundAndPack:
5960     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
5961                                          zSign, zExp, zSig0, zSig1, status);
5962 }
5963 
5964 /*----------------------------------------------------------------------------
5965 | Returns the result of adding the extended double-precision floating-point
5966 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
5967 | Standard for Binary Floating-Point Arithmetic.
5968 *----------------------------------------------------------------------------*/
5969 
5970 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
5971 {
5972     flag aSign, bSign;
5973 
5974     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5975         float_raise(float_flag_invalid, status);
5976         return floatx80_default_nan(status);
5977     }
5978     aSign = extractFloatx80Sign( a );
5979     bSign = extractFloatx80Sign( b );
5980     if ( aSign == bSign ) {
5981         return addFloatx80Sigs(a, b, aSign, status);
5982     }
5983     else {
5984         return subFloatx80Sigs(a, b, aSign, status);
5985     }
5986 
5987 }
5988 
5989 /*----------------------------------------------------------------------------
5990 | Returns the result of subtracting the extended double-precision floating-
5991 | point values `a' and `b'.  The operation is performed according to the
5992 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5993 *----------------------------------------------------------------------------*/
5994 
5995 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
5996 {
5997     flag aSign, bSign;
5998 
5999     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6000         float_raise(float_flag_invalid, status);
6001         return floatx80_default_nan(status);
6002     }
6003     aSign = extractFloatx80Sign( a );
6004     bSign = extractFloatx80Sign( b );
6005     if ( aSign == bSign ) {
6006         return subFloatx80Sigs(a, b, aSign, status);
6007     }
6008     else {
6009         return addFloatx80Sigs(a, b, aSign, status);
6010     }
6011 
6012 }
6013 
6014 /*----------------------------------------------------------------------------
6015 | Returns the result of multiplying the extended double-precision floating-
6016 | point values `a' and `b'.  The operation is performed according to the
6017 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6018 *----------------------------------------------------------------------------*/
6019 
6020 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
6021 {
6022     flag aSign, bSign, zSign;
6023     int32_t aExp, bExp, zExp;
6024     uint64_t aSig, bSig, zSig0, zSig1;
6025 
6026     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6027         float_raise(float_flag_invalid, status);
6028         return floatx80_default_nan(status);
6029     }
6030     aSig = extractFloatx80Frac( a );
6031     aExp = extractFloatx80Exp( a );
6032     aSign = extractFloatx80Sign( a );
6033     bSig = extractFloatx80Frac( b );
6034     bExp = extractFloatx80Exp( b );
6035     bSign = extractFloatx80Sign( b );
6036     zSign = aSign ^ bSign;
6037     if ( aExp == 0x7FFF ) {
6038         if (    (uint64_t) ( aSig<<1 )
6039              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
6040             return propagateFloatx80NaN(a, b, status);
6041         }
6042         if ( ( bExp | bSig ) == 0 ) goto invalid;
6043         return packFloatx80(zSign, floatx80_infinity_high,
6044                                    floatx80_infinity_low);
6045     }
6046     if ( bExp == 0x7FFF ) {
6047         if ((uint64_t)(bSig << 1)) {
6048             return propagateFloatx80NaN(a, b, status);
6049         }
6050         if ( ( aExp | aSig ) == 0 ) {
6051  invalid:
6052             float_raise(float_flag_invalid, status);
6053             return floatx80_default_nan(status);
6054         }
6055         return packFloatx80(zSign, floatx80_infinity_high,
6056                                    floatx80_infinity_low);
6057     }
6058     if ( aExp == 0 ) {
6059         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
6060         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
6061     }
6062     if ( bExp == 0 ) {
6063         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
6064         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
6065     }
6066     zExp = aExp + bExp - 0x3FFE;
6067     mul64To128( aSig, bSig, &zSig0, &zSig1 );
6068     if ( 0 < (int64_t) zSig0 ) {
6069         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
6070         --zExp;
6071     }
6072     return roundAndPackFloatx80(status->floatx80_rounding_precision,
6073                                 zSign, zExp, zSig0, zSig1, status);
6074 }
6075 
6076 /*----------------------------------------------------------------------------
6077 | Returns the result of dividing the extended double-precision floating-point
6078 | value `a' by the corresponding value `b'.  The operation is performed
6079 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6080 *----------------------------------------------------------------------------*/
6081 
6082 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
6083 {
6084     flag aSign, bSign, zSign;
6085     int32_t aExp, bExp, zExp;
6086     uint64_t aSig, bSig, zSig0, zSig1;
6087     uint64_t rem0, rem1, rem2, term0, term1, term2;
6088 
6089     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6090         float_raise(float_flag_invalid, status);
6091         return floatx80_default_nan(status);
6092     }
6093     aSig = extractFloatx80Frac( a );
6094     aExp = extractFloatx80Exp( a );
6095     aSign = extractFloatx80Sign( a );
6096     bSig = extractFloatx80Frac( b );
6097     bExp = extractFloatx80Exp( b );
6098     bSign = extractFloatx80Sign( b );
6099     zSign = aSign ^ bSign;
6100     if ( aExp == 0x7FFF ) {
6101         if ((uint64_t)(aSig << 1)) {
6102             return propagateFloatx80NaN(a, b, status);
6103         }
6104         if ( bExp == 0x7FFF ) {
6105             if ((uint64_t)(bSig << 1)) {
6106                 return propagateFloatx80NaN(a, b, status);
6107             }
6108             goto invalid;
6109         }
6110         return packFloatx80(zSign, floatx80_infinity_high,
6111                                    floatx80_infinity_low);
6112     }
6113     if ( bExp == 0x7FFF ) {
6114         if ((uint64_t)(bSig << 1)) {
6115             return propagateFloatx80NaN(a, b, status);
6116         }
6117         return packFloatx80( zSign, 0, 0 );
6118     }
6119     if ( bExp == 0 ) {
6120         if ( bSig == 0 ) {
6121             if ( ( aExp | aSig ) == 0 ) {
6122  invalid:
6123                 float_raise(float_flag_invalid, status);
6124                 return floatx80_default_nan(status);
6125             }
6126             float_raise(float_flag_divbyzero, status);
6127             return packFloatx80(zSign, floatx80_infinity_high,
6128                                        floatx80_infinity_low);
6129         }
6130         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
6131     }
6132     if ( aExp == 0 ) {
6133         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
6134         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
6135     }
6136     zExp = aExp - bExp + 0x3FFE;
6137     rem1 = 0;
6138     if ( bSig <= aSig ) {
6139         shift128Right( aSig, 0, 1, &aSig, &rem1 );
6140         ++zExp;
6141     }
6142     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
6143     mul64To128( bSig, zSig0, &term0, &term1 );
6144     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
6145     while ( (int64_t) rem0 < 0 ) {
6146         --zSig0;
6147         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
6148     }
6149     zSig1 = estimateDiv128To64( rem1, 0, bSig );
6150     if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
6151         mul64To128( bSig, zSig1, &term1, &term2 );
6152         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6153         while ( (int64_t) rem1 < 0 ) {
6154             --zSig1;
6155             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
6156         }
6157         zSig1 |= ( ( rem1 | rem2 ) != 0 );
6158     }
6159     return roundAndPackFloatx80(status->floatx80_rounding_precision,
6160                                 zSign, zExp, zSig0, zSig1, status);
6161 }
6162 
6163 /*----------------------------------------------------------------------------
6164 | Returns the remainder of the extended double-precision floating-point value
6165 | `a' with respect to the corresponding value `b'.  The operation is performed
6166 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6167 *----------------------------------------------------------------------------*/
6168 
6169 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
6170 {
6171     flag aSign, zSign;
6172     int32_t aExp, bExp, expDiff;
6173     uint64_t aSig0, aSig1, bSig;
6174     uint64_t q, term0, term1, alternateASig0, alternateASig1;
6175 
6176     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6177         float_raise(float_flag_invalid, status);
6178         return floatx80_default_nan(status);
6179     }
6180     aSig0 = extractFloatx80Frac( a );
6181     aExp = extractFloatx80Exp( a );
6182     aSign = extractFloatx80Sign( a );
6183     bSig = extractFloatx80Frac( b );
6184     bExp = extractFloatx80Exp( b );
6185     if ( aExp == 0x7FFF ) {
6186         if (    (uint64_t) ( aSig0<<1 )
6187              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
6188             return propagateFloatx80NaN(a, b, status);
6189         }
6190         goto invalid;
6191     }
6192     if ( bExp == 0x7FFF ) {
6193         if ((uint64_t)(bSig << 1)) {
6194             return propagateFloatx80NaN(a, b, status);
6195         }
6196         return a;
6197     }
6198     if ( bExp == 0 ) {
6199         if ( bSig == 0 ) {
6200  invalid:
6201             float_raise(float_flag_invalid, status);
6202             return floatx80_default_nan(status);
6203         }
6204         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
6205     }
6206     if ( aExp == 0 ) {
6207         if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
6208         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
6209     }
6210     bSig |= UINT64_C(0x8000000000000000);
6211     zSign = aSign;
6212     expDiff = aExp - bExp;
6213     aSig1 = 0;
6214     if ( expDiff < 0 ) {
6215         if ( expDiff < -1 ) return a;
6216         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
6217         expDiff = 0;
6218     }
6219     q = ( bSig <= aSig0 );
6220     if ( q ) aSig0 -= bSig;
6221     expDiff -= 64;
6222     while ( 0 < expDiff ) {
6223         q = estimateDiv128To64( aSig0, aSig1, bSig );
6224         q = ( 2 < q ) ? q - 2 : 0;
6225         mul64To128( bSig, q, &term0, &term1 );
6226         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6227         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
6228         expDiff -= 62;
6229     }
6230     expDiff += 64;
6231     if ( 0 < expDiff ) {
6232         q = estimateDiv128To64( aSig0, aSig1, bSig );
6233         q = ( 2 < q ) ? q - 2 : 0;
6234         q >>= 64 - expDiff;
6235         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
6236         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6237         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
6238         while ( le128( term0, term1, aSig0, aSig1 ) ) {
6239             ++q;
6240             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6241         }
6242     }
6243     else {
6244         term1 = 0;
6245         term0 = bSig;
6246     }
6247     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
6248     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
6249          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
6250               && ( q & 1 ) )
6251        ) {
6252         aSig0 = alternateASig0;
6253         aSig1 = alternateASig1;
6254         zSign = ! zSign;
6255     }
6256     return
6257         normalizeRoundAndPackFloatx80(
6258             80, zSign, bExp + expDiff, aSig0, aSig1, status);
6259 
6260 }
6261 
6262 /*----------------------------------------------------------------------------
6263 | Returns the square root of the extended double-precision floating-point
6264 | value `a'.  The operation is performed according to the IEC/IEEE Standard
6265 | for Binary Floating-Point Arithmetic.
6266 *----------------------------------------------------------------------------*/
6267 
6268 floatx80 floatx80_sqrt(floatx80 a, float_status *status)
6269 {
6270     flag aSign;
6271     int32_t aExp, zExp;
6272     uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
6273     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6274 
6275     if (floatx80_invalid_encoding(a)) {
6276         float_raise(float_flag_invalid, status);
6277         return floatx80_default_nan(status);
6278     }
6279     aSig0 = extractFloatx80Frac( a );
6280     aExp = extractFloatx80Exp( a );
6281     aSign = extractFloatx80Sign( a );
6282     if ( aExp == 0x7FFF ) {
6283         if ((uint64_t)(aSig0 << 1)) {
6284             return propagateFloatx80NaN(a, a, status);
6285         }
6286         if ( ! aSign ) return a;
6287         goto invalid;
6288     }
6289     if ( aSign ) {
6290         if ( ( aExp | aSig0 ) == 0 ) return a;
6291  invalid:
6292         float_raise(float_flag_invalid, status);
6293         return floatx80_default_nan(status);
6294     }
6295     if ( aExp == 0 ) {
6296         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
6297         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
6298     }
6299     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
6300     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
6301     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
6302     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6303     doubleZSig0 = zSig0<<1;
6304     mul64To128( zSig0, zSig0, &term0, &term1 );
6305     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6306     while ( (int64_t) rem0 < 0 ) {
6307         --zSig0;
6308         doubleZSig0 -= 2;
6309         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6310     }
6311     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6312     if ( ( zSig1 & UINT64_C(0x3FFFFFFFFFFFFFFF) ) <= 5 ) {
6313         if ( zSig1 == 0 ) zSig1 = 1;
6314         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6315         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6316         mul64To128( zSig1, zSig1, &term2, &term3 );
6317         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6318         while ( (int64_t) rem1 < 0 ) {
6319             --zSig1;
6320             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6321             term3 |= 1;
6322             term2 |= doubleZSig0;
6323             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6324         }
6325         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6326     }
6327     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
6328     zSig0 |= doubleZSig0;
6329     return roundAndPackFloatx80(status->floatx80_rounding_precision,
6330                                 0, zExp, zSig0, zSig1, status);
6331 }
6332 
6333 /*----------------------------------------------------------------------------
6334 | Returns 1 if the extended double-precision floating-point value `a' is equal
6335 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
6336 | raised if either operand is a NaN.  Otherwise, the comparison is performed
6337 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6338 *----------------------------------------------------------------------------*/
6339 
6340 int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
6341 {
6342 
6343     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
6344         || (extractFloatx80Exp(a) == 0x7FFF
6345             && (uint64_t) (extractFloatx80Frac(a) << 1))
6346         || (extractFloatx80Exp(b) == 0x7FFF
6347             && (uint64_t) (extractFloatx80Frac(b) << 1))
6348        ) {
6349         float_raise(float_flag_invalid, status);
6350         return 0;
6351     }
6352     return
6353            ( a.low == b.low )
6354         && (    ( a.high == b.high )
6355              || (    ( a.low == 0 )
6356                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6357            );
6358 
6359 }
6360 
6361 /*----------------------------------------------------------------------------
6362 | Returns 1 if the extended double-precision floating-point value `a' is
6363 | less than or equal to the corresponding value `b', and 0 otherwise.  The
6364 | invalid exception is raised if either operand is a NaN.  The comparison is
6365 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6366 | Arithmetic.
6367 *----------------------------------------------------------------------------*/
6368 
6369 int floatx80_le(floatx80 a, floatx80 b, float_status *status)
6370 {
6371     flag aSign, bSign;
6372 
6373     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
6374         || (extractFloatx80Exp(a) == 0x7FFF
6375             && (uint64_t) (extractFloatx80Frac(a) << 1))
6376         || (extractFloatx80Exp(b) == 0x7FFF
6377             && (uint64_t) (extractFloatx80Frac(b) << 1))
6378        ) {
6379         float_raise(float_flag_invalid, status);
6380         return 0;
6381     }
6382     aSign = extractFloatx80Sign( a );
6383     bSign = extractFloatx80Sign( b );
6384     if ( aSign != bSign ) {
6385         return
6386                aSign
6387             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6388                  == 0 );
6389     }
6390     return
6391           aSign ? le128( b.high, b.low, a.high, a.low )
6392         : le128( a.high, a.low, b.high, b.low );
6393 
6394 }
6395 
6396 /*----------------------------------------------------------------------------
6397 | Returns 1 if the extended double-precision floating-point value `a' is
6398 | less than the corresponding value `b', and 0 otherwise.  The invalid
6399 | exception is raised if either operand is a NaN.  The comparison is performed
6400 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6401 *----------------------------------------------------------------------------*/
6402 
6403 int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
6404 {
6405     flag aSign, bSign;
6406 
6407     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
6408         || (extractFloatx80Exp(a) == 0x7FFF
6409             && (uint64_t) (extractFloatx80Frac(a) << 1))
6410         || (extractFloatx80Exp(b) == 0x7FFF
6411             && (uint64_t) (extractFloatx80Frac(b) << 1))
6412        ) {
6413         float_raise(float_flag_invalid, status);
6414         return 0;
6415     }
6416     aSign = extractFloatx80Sign( a );
6417     bSign = extractFloatx80Sign( b );
6418     if ( aSign != bSign ) {
6419         return
6420                aSign
6421             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6422                  != 0 );
6423     }
6424     return
6425           aSign ? lt128( b.high, b.low, a.high, a.low )
6426         : lt128( a.high, a.low, b.high, b.low );
6427 
6428 }
6429 
6430 /*----------------------------------------------------------------------------
6431 | Returns 1 if the extended double-precision floating-point values `a' and `b'
6432 | cannot be compared, and 0 otherwise.  The invalid exception is raised if
6433 | either operand is a NaN.   The comparison is performed according to the
6434 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6435 *----------------------------------------------------------------------------*/
6436 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
6437 {
6438     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
6439         || (extractFloatx80Exp(a) == 0x7FFF
6440             && (uint64_t) (extractFloatx80Frac(a) << 1))
6441         || (extractFloatx80Exp(b) == 0x7FFF
6442             && (uint64_t) (extractFloatx80Frac(b) << 1))
6443        ) {
6444         float_raise(float_flag_invalid, status);
6445         return 1;
6446     }
6447     return 0;
6448 }
6449 
6450 /*----------------------------------------------------------------------------
6451 | Returns 1 if the extended double-precision floating-point value `a' is
6452 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
6453 | cause an exception.  The comparison is performed according to the IEC/IEEE
6454 | Standard for Binary Floating-Point Arithmetic.
6455 *----------------------------------------------------------------------------*/
6456 
6457 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
6458 {
6459 
6460     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6461         float_raise(float_flag_invalid, status);
6462         return 0;
6463     }
6464     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
6465               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
6466          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
6467               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
6468        ) {
6469         if (floatx80_is_signaling_nan(a, status)
6470          || floatx80_is_signaling_nan(b, status)) {
6471             float_raise(float_flag_invalid, status);
6472         }
6473         return 0;
6474     }
6475     return
6476            ( a.low == b.low )
6477         && (    ( a.high == b.high )
6478              || (    ( a.low == 0 )
6479                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6480            );
6481 
6482 }
6483 
6484 /*----------------------------------------------------------------------------
6485 | Returns 1 if the extended double-precision floating-point value `a' is less
6486 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
6487 | do not cause an exception.  Otherwise, the comparison is performed according
6488 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6489 *----------------------------------------------------------------------------*/
6490 
6491 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
6492 {
6493     flag aSign, bSign;
6494 
6495     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6496         float_raise(float_flag_invalid, status);
6497         return 0;
6498     }
6499     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
6500               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
6501          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
6502               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
6503        ) {
6504         if (floatx80_is_signaling_nan(a, status)
6505          || floatx80_is_signaling_nan(b, status)) {
6506             float_raise(float_flag_invalid, status);
6507         }
6508         return 0;
6509     }
6510     aSign = extractFloatx80Sign( a );
6511     bSign = extractFloatx80Sign( b );
6512     if ( aSign != bSign ) {
6513         return
6514                aSign
6515             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6516                  == 0 );
6517     }
6518     return
6519           aSign ? le128( b.high, b.low, a.high, a.low )
6520         : le128( a.high, a.low, b.high, b.low );
6521 
6522 }
6523 
6524 /*----------------------------------------------------------------------------
6525 | Returns 1 if the extended double-precision floating-point value `a' is less
6526 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
6527 | an exception.  Otherwise, the comparison is performed according to the
6528 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6529 *----------------------------------------------------------------------------*/
6530 
6531 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
6532 {
6533     flag aSign, bSign;
6534 
6535     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6536         float_raise(float_flag_invalid, status);
6537         return 0;
6538     }
6539     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
6540               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
6541          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
6542               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
6543        ) {
6544         if (floatx80_is_signaling_nan(a, status)
6545          || floatx80_is_signaling_nan(b, status)) {
6546             float_raise(float_flag_invalid, status);
6547         }
6548         return 0;
6549     }
6550     aSign = extractFloatx80Sign( a );
6551     bSign = extractFloatx80Sign( b );
6552     if ( aSign != bSign ) {
6553         return
6554                aSign
6555             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6556                  != 0 );
6557     }
6558     return
6559           aSign ? lt128( b.high, b.low, a.high, a.low )
6560         : lt128( a.high, a.low, b.high, b.low );
6561 
6562 }
6563 
6564 /*----------------------------------------------------------------------------
6565 | Returns 1 if the extended double-precision floating-point values `a' and `b'
6566 | cannot be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.
6567 | The comparison is performed according to the IEC/IEEE Standard for Binary
6568 | Floating-Point Arithmetic.
6569 *----------------------------------------------------------------------------*/
6570 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
6571 {
6572     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6573         float_raise(float_flag_invalid, status);
6574         return 1;
6575     }
6576     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
6577               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
6578          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
6579               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
6580        ) {
6581         if (floatx80_is_signaling_nan(a, status)
6582          || floatx80_is_signaling_nan(b, status)) {
6583             float_raise(float_flag_invalid, status);
6584         }
6585         return 1;
6586     }
6587     return 0;
6588 }
6589 
6590 /*----------------------------------------------------------------------------
6591 | Returns the result of converting the quadruple-precision floating-point
6592 | value `a' to the 32-bit two's complement integer format.  The conversion
6593 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6594 | Arithmetic---which means in particular that the conversion is rounded
6595 | according to the current rounding mode.  If `a' is a NaN, the largest
6596 | positive integer is returned.  Otherwise, if the conversion overflows, the
6597 | largest integer with the same sign as `a' is returned.
6598 *----------------------------------------------------------------------------*/
6599 
6600 int32_t float128_to_int32(float128 a, float_status *status)
6601 {
6602     flag aSign;
6603     int32_t aExp, shiftCount;
6604     uint64_t aSig0, aSig1;
6605 
6606     aSig1 = extractFloat128Frac1( a );
6607     aSig0 = extractFloat128Frac0( a );
6608     aExp = extractFloat128Exp( a );
6609     aSign = extractFloat128Sign( a );
6610     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
6611     if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
6612     aSig0 |= ( aSig1 != 0 );
6613     shiftCount = 0x4028 - aExp;
6614     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
6615     return roundAndPackInt32(aSign, aSig0, status);
6616 
6617 }
6618 
6619 /*----------------------------------------------------------------------------
6620 | Returns the result of converting the quadruple-precision floating-point
6621 | value `a' to the 32-bit two's complement integer format.  The conversion
6622 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6623 | Arithmetic, except that the conversion is always rounded toward zero.  If
6624 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
6625 | conversion overflows, the largest integer with the same sign as `a' is
6626 | returned.
6627 *----------------------------------------------------------------------------*/
6628 
6629 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
6630 {
6631     flag aSign;
6632     int32_t aExp, shiftCount;
6633     uint64_t aSig0, aSig1, savedASig;
6634     int32_t z;
6635 
6636     aSig1 = extractFloat128Frac1( a );
6637     aSig0 = extractFloat128Frac0( a );
6638     aExp = extractFloat128Exp( a );
6639     aSign = extractFloat128Sign( a );
6640     aSig0 |= ( aSig1 != 0 );
6641     if ( 0x401E < aExp ) {
6642         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
6643         goto invalid;
6644     }
6645     else if ( aExp < 0x3FFF ) {
6646         if (aExp || aSig0) {
6647             status->float_exception_flags |= float_flag_inexact;
6648         }
6649         return 0;
6650     }
6651     aSig0 |= UINT64_C(0x0001000000000000);
6652     shiftCount = 0x402F - aExp;
6653     savedASig = aSig0;
6654     aSig0 >>= shiftCount;
6655     z = aSig0;
6656     if ( aSign ) z = - z;
6657     if ( ( z < 0 ) ^ aSign ) {
6658  invalid:
6659         float_raise(float_flag_invalid, status);
6660         return aSign ? INT32_MIN : INT32_MAX;
6661     }
6662     if ( ( aSig0<<shiftCount ) != savedASig ) {
6663         status->float_exception_flags |= float_flag_inexact;
6664     }
6665     return z;
6666 
6667 }
6668 
6669 /*----------------------------------------------------------------------------
6670 | Returns the result of converting the quadruple-precision floating-point
6671 | value `a' to the 64-bit two's complement integer format.  The conversion
6672 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6673 | Arithmetic---which means in particular that the conversion is rounded
6674 | according to the current rounding mode.  If `a' is a NaN, the largest
6675 | positive integer is returned.  Otherwise, if the conversion overflows, the
6676 | largest integer with the same sign as `a' is returned.
6677 *----------------------------------------------------------------------------*/
6678 
6679 int64_t float128_to_int64(float128 a, float_status *status)
6680 {
6681     flag aSign;
6682     int32_t aExp, shiftCount;
6683     uint64_t aSig0, aSig1;
6684 
6685     aSig1 = extractFloat128Frac1( a );
6686     aSig0 = extractFloat128Frac0( a );
6687     aExp = extractFloat128Exp( a );
6688     aSign = extractFloat128Sign( a );
6689     if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
6690     shiftCount = 0x402F - aExp;
6691     if ( shiftCount <= 0 ) {
6692         if ( 0x403E < aExp ) {
6693             float_raise(float_flag_invalid, status);
6694             if (    ! aSign
6695                  || (    ( aExp == 0x7FFF )
6696                       && ( aSig1 || ( aSig0 != UINT64_C(0x0001000000000000) ) )
6697                     )
6698                ) {
6699                 return INT64_MAX;
6700             }
6701             return INT64_MIN;
6702         }
6703         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
6704     }
6705     else {
6706         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
6707     }
6708     return roundAndPackInt64(aSign, aSig0, aSig1, status);
6709 
6710 }
6711 
6712 /*----------------------------------------------------------------------------
6713 | Returns the result of converting the quadruple-precision floating-point
6714 | value `a' to the 64-bit two's complement integer format.  The conversion
6715 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6716 | Arithmetic, except that the conversion is always rounded toward zero.
6717 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
6718 | the conversion overflows, the largest integer with the same sign as `a' is
6719 | returned.
6720 *----------------------------------------------------------------------------*/
6721 
6722 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
6723 {
6724     flag aSign;
6725     int32_t aExp, shiftCount;
6726     uint64_t aSig0, aSig1;
6727     int64_t z;
6728 
6729     aSig1 = extractFloat128Frac1( a );
6730     aSig0 = extractFloat128Frac0( a );
6731     aExp = extractFloat128Exp( a );
6732     aSign = extractFloat128Sign( a );
6733     if ( aExp ) aSig0 |= UINT64_C(0x0001000000000000);
6734     shiftCount = aExp - 0x402F;
6735     if ( 0 < shiftCount ) {
6736         if ( 0x403E <= aExp ) {
6737             aSig0 &= UINT64_C(0x0000FFFFFFFFFFFF);
6738             if (    ( a.high == UINT64_C(0xC03E000000000000) )
6739                  && ( aSig1 < UINT64_C(0x0002000000000000) ) ) {
6740                 if (aSig1) {
6741                     status->float_exception_flags |= float_flag_inexact;
6742                 }
6743             }
6744             else {
6745                 float_raise(float_flag_invalid, status);
6746                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
6747                     return INT64_MAX;
6748                 }
6749             }
6750             return INT64_MIN;
6751         }
6752         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
6753         if ( (uint64_t) ( aSig1<<shiftCount ) ) {
6754             status->float_exception_flags |= float_flag_inexact;
6755         }
6756     }
6757     else {
6758         if ( aExp < 0x3FFF ) {
6759             if ( aExp | aSig0 | aSig1 ) {
6760                 status->float_exception_flags |= float_flag_inexact;
6761             }
6762             return 0;
6763         }
6764         z = aSig0>>( - shiftCount );
6765         if (    aSig1
6766              || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
6767             status->float_exception_flags |= float_flag_inexact;
6768         }
6769     }
6770     if ( aSign ) z = - z;
6771     return z;
6772 
6773 }
6774 
6775 /*----------------------------------------------------------------------------
6776 | Returns the result of converting the quadruple-precision floating-point value
6777 | `a' to the 64-bit unsigned integer format.  The conversion is
6778 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6779 | Arithmetic---which means in particular that the conversion is rounded
6780 | according to the current rounding mode.  If `a' is a NaN, the largest
6781 | positive integer is returned.  If the conversion overflows, the
6782 | largest unsigned integer is returned.  If 'a' is negative, the value is
6783 | rounded and zero is returned; negative values that do not round to zero
6784 | will raise the inexact exception.
6785 *----------------------------------------------------------------------------*/
6786 
6787 uint64_t float128_to_uint64(float128 a, float_status *status)
6788 {
6789     flag aSign;
6790     int aExp;
6791     int shiftCount;
6792     uint64_t aSig0, aSig1;
6793 
6794     aSig0 = extractFloat128Frac0(a);
6795     aSig1 = extractFloat128Frac1(a);
6796     aExp = extractFloat128Exp(a);
6797     aSign = extractFloat128Sign(a);
6798     if (aSign && (aExp > 0x3FFE)) {
6799         float_raise(float_flag_invalid, status);
6800         if (float128_is_any_nan(a)) {
6801             return UINT64_MAX;
6802         } else {
6803             return 0;
6804         }
6805     }
6806     if (aExp) {
6807         aSig0 |= UINT64_C(0x0001000000000000);
6808     }
6809     shiftCount = 0x402F - aExp;
6810     if (shiftCount <= 0) {
6811         if (0x403E < aExp) {
6812             float_raise(float_flag_invalid, status);
6813             return UINT64_MAX;
6814         }
6815         shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
6816     } else {
6817         shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
6818     }
6819     return roundAndPackUint64(aSign, aSig0, aSig1, status);
6820 }
6821 
6822 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
6823 {
6824     uint64_t v;
6825     signed char current_rounding_mode = status->float_rounding_mode;
6826 
6827     set_float_rounding_mode(float_round_to_zero, status);
6828     v = float128_to_uint64(a, status);
6829     set_float_rounding_mode(current_rounding_mode, status);
6830 
6831     return v;
6832 }
6833 
6834 /*----------------------------------------------------------------------------
6835 | Returns the result of converting the quadruple-precision floating-point
6836 | value `a' to the 32-bit unsigned integer format.  The conversion
6837 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6838 | Arithmetic except that the conversion is always rounded toward zero.
6839 | If `a' is a NaN, the largest positive integer is returned.  Otherwise,
6840 | if the conversion overflows, the largest unsigned integer is returned.
6841 | If 'a' is negative, the value is rounded and zero is returned; negative
6842 | values that do not round to zero will raise the inexact exception.
6843 *----------------------------------------------------------------------------*/
6844 
6845 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
6846 {
6847     uint64_t v;
6848     uint32_t res;
6849     int old_exc_flags = get_float_exception_flags(status);
6850 
6851     v = float128_to_uint64_round_to_zero(a, status);
6852     if (v > 0xffffffff) {
6853         res = 0xffffffff;
6854     } else {
6855         return v;
6856     }
6857     set_float_exception_flags(old_exc_flags, status);
6858     float_raise(float_flag_invalid, status);
6859     return res;
6860 }
6861 
6862 /*----------------------------------------------------------------------------
6863 | Returns the result of converting the quadruple-precision floating-point value
6864 | `a' to the 32-bit unsigned integer format.  The conversion is
6865 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6866 | Arithmetic---which means in particular that the conversion is rounded
6867 | according to the current rounding mode.  If `a' is a NaN, the largest
6868 | positive integer is returned.  If the conversion overflows, the
6869 | largest unsigned integer is returned.  If 'a' is negative, the value is
6870 | rounded and zero is returned; negative values that do not round to zero
6871 | will raise the inexact exception.
6872 *----------------------------------------------------------------------------*/
6873 
6874 uint32_t float128_to_uint32(float128 a, float_status *status)
6875 {
6876     uint64_t v;
6877     uint32_t res;
6878     int old_exc_flags = get_float_exception_flags(status);
6879 
6880     v = float128_to_uint64(a, status);
6881     if (v > 0xffffffff) {
6882         res = 0xffffffff;
6883     } else {
6884         return v;
6885     }
6886     set_float_exception_flags(old_exc_flags, status);
6887     float_raise(float_flag_invalid, status);
6888     return res;
6889 }
6890 
6891 /*----------------------------------------------------------------------------
6892 | Returns the result of converting the quadruple-precision floating-point
6893 | value `a' to the single-precision floating-point format.  The conversion
6894 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6895 | Arithmetic.
6896 *----------------------------------------------------------------------------*/
6897 
6898 float32 float128_to_float32(float128 a, float_status *status)
6899 {
6900     flag aSign;
6901     int32_t aExp;
6902     uint64_t aSig0, aSig1;
6903     uint32_t zSig;
6904 
6905     aSig1 = extractFloat128Frac1( a );
6906     aSig0 = extractFloat128Frac0( a );
6907     aExp = extractFloat128Exp( a );
6908     aSign = extractFloat128Sign( a );
6909     if ( aExp == 0x7FFF ) {
6910         if ( aSig0 | aSig1 ) {
6911             return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
6912         }
6913         return packFloat32( aSign, 0xFF, 0 );
6914     }
6915     aSig0 |= ( aSig1 != 0 );
6916     shift64RightJamming( aSig0, 18, &aSig0 );
6917     zSig = aSig0;
6918     if ( aExp || zSig ) {
6919         zSig |= 0x40000000;
6920         aExp -= 0x3F81;
6921     }
6922     return roundAndPackFloat32(aSign, aExp, zSig, status);
6923 
6924 }
6925 
6926 /*----------------------------------------------------------------------------
6927 | Returns the result of converting the quadruple-precision floating-point
6928 | value `a' to the double-precision floating-point format.  The conversion
6929 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6930 | Arithmetic.
6931 *----------------------------------------------------------------------------*/
6932 
6933 float64 float128_to_float64(float128 a, float_status *status)
6934 {
6935     flag aSign;
6936     int32_t aExp;
6937     uint64_t aSig0, aSig1;
6938 
6939     aSig1 = extractFloat128Frac1( a );
6940     aSig0 = extractFloat128Frac0( a );
6941     aExp = extractFloat128Exp( a );
6942     aSign = extractFloat128Sign( a );
6943     if ( aExp == 0x7FFF ) {
6944         if ( aSig0 | aSig1 ) {
6945             return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
6946         }
6947         return packFloat64( aSign, 0x7FF, 0 );
6948     }
6949     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6950     aSig0 |= ( aSig1 != 0 );
6951     if ( aExp || aSig0 ) {
6952         aSig0 |= UINT64_C(0x4000000000000000);
6953         aExp -= 0x3C01;
6954     }
6955     return roundAndPackFloat64(aSign, aExp, aSig0, status);
6956 
6957 }
6958 
6959 /*----------------------------------------------------------------------------
6960 | Returns the result of converting the quadruple-precision floating-point
6961 | value `a' to the extended double-precision floating-point format.  The
6962 | conversion is performed according to the IEC/IEEE Standard for Binary
6963 | Floating-Point Arithmetic.
6964 *----------------------------------------------------------------------------*/
6965 
6966 floatx80 float128_to_floatx80(float128 a, float_status *status)
6967 {
6968     flag aSign;
6969     int32_t aExp;
6970     uint64_t aSig0, aSig1;
6971 
6972     aSig1 = extractFloat128Frac1( a );
6973     aSig0 = extractFloat128Frac0( a );
6974     aExp = extractFloat128Exp( a );
6975     aSign = extractFloat128Sign( a );
6976     if ( aExp == 0x7FFF ) {
6977         if ( aSig0 | aSig1 ) {
6978             floatx80 res = commonNaNToFloatx80(float128ToCommonNaN(a, status),
6979                                                status);
6980             return floatx80_silence_nan(res, status);
6981         }
6982         return packFloatx80(aSign, floatx80_infinity_high,
6983                                    floatx80_infinity_low);
6984     }
6985     if ( aExp == 0 ) {
6986         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
6987         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6988     }
6989     else {
6990         aSig0 |= UINT64_C(0x0001000000000000);
6991     }
6992     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
6993     return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
6994 
6995 }
6996 
6997 /*----------------------------------------------------------------------------
6998 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6999 | returns the result as a quadruple-precision floating-point value.  The
7000 | operation is performed according to the IEC/IEEE Standard for Binary
7001 | Floating-Point Arithmetic.
7002 *----------------------------------------------------------------------------*/
7003 
7004 float128 float128_round_to_int(float128 a, float_status *status)
7005 {
7006     flag aSign;
7007     int32_t aExp;
7008     uint64_t lastBitMask, roundBitsMask;
7009     float128 z;
7010 
7011     aExp = extractFloat128Exp( a );
7012     if ( 0x402F <= aExp ) {
7013         if ( 0x406F <= aExp ) {
7014             if (    ( aExp == 0x7FFF )
7015                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
7016                ) {
7017                 return propagateFloat128NaN(a, a, status);
7018             }
7019             return a;
7020         }
7021         lastBitMask = 1;
7022         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
7023         roundBitsMask = lastBitMask - 1;
7024         z = a;
7025         switch (status->float_rounding_mode) {
7026         case float_round_nearest_even:
7027             if ( lastBitMask ) {
7028                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
7029                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
7030             }
7031             else {
7032                 if ( (int64_t) z.low < 0 ) {
7033                     ++z.high;
7034                     if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
7035                 }
7036             }
7037             break;
7038         case float_round_ties_away:
7039             if (lastBitMask) {
7040                 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
7041             } else {
7042                 if ((int64_t) z.low < 0) {
7043                     ++z.high;
7044                 }
7045             }
7046             break;
7047         case float_round_to_zero:
7048             break;
7049         case float_round_up:
7050             if (!extractFloat128Sign(z)) {
7051                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
7052             }
7053             break;
7054         case float_round_down:
7055             if (extractFloat128Sign(z)) {
7056                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
7057             }
7058             break;
7059         case float_round_to_odd:
7060             /*
7061              * Note that if lastBitMask == 0, the last bit is the lsb
7062              * of high, and roundBitsMask == -1.
7063              */
7064             if ((lastBitMask ? z.low & lastBitMask : z.high & 1) == 0) {
7065                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
7066             }
7067             break;
7068         default:
7069             abort();
7070         }
7071         z.low &= ~ roundBitsMask;
7072     }
7073     else {
7074         if ( aExp < 0x3FFF ) {
7075             if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
7076             status->float_exception_flags |= float_flag_inexact;
7077             aSign = extractFloat128Sign( a );
7078             switch (status->float_rounding_mode) {
7079             case float_round_nearest_even:
7080                 if (    ( aExp == 0x3FFE )
7081                      && (   extractFloat128Frac0( a )
7082                           | extractFloat128Frac1( a ) )
7083                    ) {
7084                     return packFloat128( aSign, 0x3FFF, 0, 0 );
7085                 }
7086                 break;
7087             case float_round_ties_away:
7088                 if (aExp == 0x3FFE) {
7089                     return packFloat128(aSign, 0x3FFF, 0, 0);
7090                 }
7091                 break;
7092             case float_round_down:
7093                 return
7094                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
7095                     : packFloat128( 0, 0, 0, 0 );
7096             case float_round_up:
7097                 return
7098                       aSign ? packFloat128( 1, 0, 0, 0 )
7099                     : packFloat128( 0, 0x3FFF, 0, 0 );
7100 
7101             case float_round_to_odd:
7102                 return packFloat128(aSign, 0x3FFF, 0, 0);
7103             }
7104             return packFloat128( aSign, 0, 0, 0 );
7105         }
7106         lastBitMask = 1;
7107         lastBitMask <<= 0x402F - aExp;
7108         roundBitsMask = lastBitMask - 1;
7109         z.low = 0;
7110         z.high = a.high;
7111         switch (status->float_rounding_mode) {
7112         case float_round_nearest_even:
7113             z.high += lastBitMask>>1;
7114             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
7115                 z.high &= ~ lastBitMask;
7116             }
7117             break;
7118         case float_round_ties_away:
7119             z.high += lastBitMask>>1;
7120             break;
7121         case float_round_to_zero:
7122             break;
7123         case float_round_up:
7124             if (!extractFloat128Sign(z)) {
7125                 z.high |= ( a.low != 0 );
7126                 z.high += roundBitsMask;
7127             }
7128             break;
7129         case float_round_down:
7130             if (extractFloat128Sign(z)) {
7131                 z.high |= (a.low != 0);
7132                 z.high += roundBitsMask;
7133             }
7134             break;
7135         case float_round_to_odd:
7136             if ((z.high & lastBitMask) == 0) {
7137                 z.high |= (a.low != 0);
7138                 z.high += roundBitsMask;
7139             }
7140             break;
7141         default:
7142             abort();
7143         }
7144         z.high &= ~ roundBitsMask;
7145     }
7146     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
7147         status->float_exception_flags |= float_flag_inexact;
7148     }
7149     return z;
7150 
7151 }
7152 
7153 /*----------------------------------------------------------------------------
7154 | Returns the result of adding the absolute values of the quadruple-precision
7155 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
7156 | before being returned.  `zSign' is ignored if the result is a NaN.
7157 | The addition is performed according to the IEC/IEEE Standard for Binary
7158 | Floating-Point Arithmetic.
7159 *----------------------------------------------------------------------------*/
7160 
7161 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
7162                                 float_status *status)
7163 {
7164     int32_t aExp, bExp, zExp;
7165     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
7166     int32_t expDiff;
7167 
7168     aSig1 = extractFloat128Frac1( a );
7169     aSig0 = extractFloat128Frac0( a );
7170     aExp = extractFloat128Exp( a );
7171     bSig1 = extractFloat128Frac1( b );
7172     bSig0 = extractFloat128Frac0( b );
7173     bExp = extractFloat128Exp( b );
7174     expDiff = aExp - bExp;
7175     if ( 0 < expDiff ) {
7176         if ( aExp == 0x7FFF ) {
7177             if (aSig0 | aSig1) {
7178                 return propagateFloat128NaN(a, b, status);
7179             }
7180             return a;
7181         }
7182         if ( bExp == 0 ) {
7183             --expDiff;
7184         }
7185         else {
7186             bSig0 |= UINT64_C(0x0001000000000000);
7187         }
7188         shift128ExtraRightJamming(
7189             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
7190         zExp = aExp;
7191     }
7192     else if ( expDiff < 0 ) {
7193         if ( bExp == 0x7FFF ) {
7194             if (bSig0 | bSig1) {
7195                 return propagateFloat128NaN(a, b, status);
7196             }
7197             return packFloat128( zSign, 0x7FFF, 0, 0 );
7198         }
7199         if ( aExp == 0 ) {
7200             ++expDiff;
7201         }
7202         else {
7203             aSig0 |= UINT64_C(0x0001000000000000);
7204         }
7205         shift128ExtraRightJamming(
7206             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
7207         zExp = bExp;
7208     }
7209     else {
7210         if ( aExp == 0x7FFF ) {
7211             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
7212                 return propagateFloat128NaN(a, b, status);
7213             }
7214             return a;
7215         }
7216         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
7217         if ( aExp == 0 ) {
7218             if (status->flush_to_zero) {
7219                 if (zSig0 | zSig1) {
7220                     float_raise(float_flag_output_denormal, status);
7221                 }
7222                 return packFloat128(zSign, 0, 0, 0);
7223             }
7224             return packFloat128( zSign, 0, zSig0, zSig1 );
7225         }
7226         zSig2 = 0;
7227         zSig0 |= UINT64_C(0x0002000000000000);
7228         zExp = aExp;
7229         goto shiftRight1;
7230     }
7231     aSig0 |= UINT64_C(0x0001000000000000);
7232     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
7233     --zExp;
7234     if ( zSig0 < UINT64_C(0x0002000000000000) ) goto roundAndPack;
7235     ++zExp;
7236  shiftRight1:
7237     shift128ExtraRightJamming(
7238         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
7239  roundAndPack:
7240     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
7241 
7242 }
7243 
7244 /*----------------------------------------------------------------------------
7245 | Returns the result of subtracting the absolute values of the quadruple-
7246 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
7247 | difference is negated before being returned.  `zSign' is ignored if the
7248 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
7249 | Standard for Binary Floating-Point Arithmetic.
7250 *----------------------------------------------------------------------------*/
7251 
7252 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
7253                                 float_status *status)
7254 {
7255     int32_t aExp, bExp, zExp;
7256     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
7257     int32_t expDiff;
7258 
7259     aSig1 = extractFloat128Frac1( a );
7260     aSig0 = extractFloat128Frac0( a );
7261     aExp = extractFloat128Exp( a );
7262     bSig1 = extractFloat128Frac1( b );
7263     bSig0 = extractFloat128Frac0( b );
7264     bExp = extractFloat128Exp( b );
7265     expDiff = aExp - bExp;
7266     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
7267     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
7268     if ( 0 < expDiff ) goto aExpBigger;
7269     if ( expDiff < 0 ) goto bExpBigger;
7270     if ( aExp == 0x7FFF ) {
7271         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
7272             return propagateFloat128NaN(a, b, status);
7273         }
7274         float_raise(float_flag_invalid, status);
7275         return float128_default_nan(status);
7276     }
7277     if ( aExp == 0 ) {
7278         aExp = 1;
7279         bExp = 1;
7280     }
7281     if ( bSig0 < aSig0 ) goto aBigger;
7282     if ( aSig0 < bSig0 ) goto bBigger;
7283     if ( bSig1 < aSig1 ) goto aBigger;
7284     if ( aSig1 < bSig1 ) goto bBigger;
7285     return packFloat128(status->float_rounding_mode == float_round_down,
7286                         0, 0, 0);
7287  bExpBigger:
7288     if ( bExp == 0x7FFF ) {
7289         if (bSig0 | bSig1) {
7290             return propagateFloat128NaN(a, b, status);
7291         }
7292         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
7293     }
7294     if ( aExp == 0 ) {
7295         ++expDiff;
7296     }
7297     else {
7298         aSig0 |= UINT64_C(0x4000000000000000);
7299     }
7300     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
7301     bSig0 |= UINT64_C(0x4000000000000000);
7302  bBigger:
7303     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
7304     zExp = bExp;
7305     zSign ^= 1;
7306     goto normalizeRoundAndPack;
7307  aExpBigger:
7308     if ( aExp == 0x7FFF ) {
7309         if (aSig0 | aSig1) {
7310             return propagateFloat128NaN(a, b, status);
7311         }
7312         return a;
7313     }
7314     if ( bExp == 0 ) {
7315         --expDiff;
7316     }
7317     else {
7318         bSig0 |= UINT64_C(0x4000000000000000);
7319     }
7320     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
7321     aSig0 |= UINT64_C(0x4000000000000000);
7322  aBigger:
7323     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
7324     zExp = aExp;
7325  normalizeRoundAndPack:
7326     --zExp;
7327     return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
7328                                          status);
7329 
7330 }
7331 
7332 /*----------------------------------------------------------------------------
7333 | Returns the result of adding the quadruple-precision floating-point values
7334 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
7335 | for Binary Floating-Point Arithmetic.
7336 *----------------------------------------------------------------------------*/
7337 
7338 float128 float128_add(float128 a, float128 b, float_status *status)
7339 {
7340     flag aSign, bSign;
7341 
7342     aSign = extractFloat128Sign( a );
7343     bSign = extractFloat128Sign( b );
7344     if ( aSign == bSign ) {
7345         return addFloat128Sigs(a, b, aSign, status);
7346     }
7347     else {
7348         return subFloat128Sigs(a, b, aSign, status);
7349     }
7350 
7351 }
7352 
7353 /*----------------------------------------------------------------------------
7354 | Returns the result of subtracting the quadruple-precision floating-point
7355 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
7356 | Standard for Binary Floating-Point Arithmetic.
7357 *----------------------------------------------------------------------------*/
7358 
7359 float128 float128_sub(float128 a, float128 b, float_status *status)
7360 {
7361     flag aSign, bSign;
7362 
7363     aSign = extractFloat128Sign( a );
7364     bSign = extractFloat128Sign( b );
7365     if ( aSign == bSign ) {
7366         return subFloat128Sigs(a, b, aSign, status);
7367     }
7368     else {
7369         return addFloat128Sigs(a, b, aSign, status);
7370     }
7371 
7372 }
7373 
7374 /*----------------------------------------------------------------------------
7375 | Returns the result of multiplying the quadruple-precision floating-point
7376 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
7377 | Standard for Binary Floating-Point Arithmetic.
7378 *----------------------------------------------------------------------------*/
7379 
7380 float128 float128_mul(float128 a, float128 b, float_status *status)
7381 {
7382     flag aSign, bSign, zSign;
7383     int32_t aExp, bExp, zExp;
7384     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
7385 
7386     aSig1 = extractFloat128Frac1( a );
7387     aSig0 = extractFloat128Frac0( a );
7388     aExp = extractFloat128Exp( a );
7389     aSign = extractFloat128Sign( a );
7390     bSig1 = extractFloat128Frac1( b );
7391     bSig0 = extractFloat128Frac0( b );
7392     bExp = extractFloat128Exp( b );
7393     bSign = extractFloat128Sign( b );
7394     zSign = aSign ^ bSign;
7395     if ( aExp == 0x7FFF ) {
7396         if (    ( aSig0 | aSig1 )
7397              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
7398             return propagateFloat128NaN(a, b, status);
7399         }
7400         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
7401         return packFloat128( zSign, 0x7FFF, 0, 0 );
7402     }
7403     if ( bExp == 0x7FFF ) {
7404         if (bSig0 | bSig1) {
7405             return propagateFloat128NaN(a, b, status);
7406         }
7407         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
7408  invalid:
7409             float_raise(float_flag_invalid, status);
7410             return float128_default_nan(status);
7411         }
7412         return packFloat128( zSign, 0x7FFF, 0, 0 );
7413     }
7414     if ( aExp == 0 ) {
7415         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
7416         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7417     }
7418     if ( bExp == 0 ) {
7419         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
7420         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
7421     }
7422     zExp = aExp + bExp - 0x4000;
7423     aSig0 |= UINT64_C(0x0001000000000000);
7424     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
7425     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
7426     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
7427     zSig2 |= ( zSig3 != 0 );
7428     if (UINT64_C( 0x0002000000000000) <= zSig0 ) {
7429         shift128ExtraRightJamming(
7430             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
7431         ++zExp;
7432     }
7433     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
7434 
7435 }
7436 
/*----------------------------------------------------------------------------
| Returns the result of dividing the quadruple-precision floating-point value
| `a' by the corresponding value `b'.  The operation is performed according to
| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
|
| Special cases handled before the general algorithm:
|   - a NaN operand is propagated through propagateFloat128NaN();
|   - infinity / infinity and zero / zero raise float_flag_invalid and return
|     the default NaN;
|   - infinity / finite returns an infinity;
|   - finite / infinity and zero / nonzero return a zero;
|   - nonzero finite / zero raises float_flag_divbyzero and returns an
|     infinity.
| The sign of every non-NaN result is the exclusive-or of the operand signs.
*----------------------------------------------------------------------------*/

float128 float128_div(float128 a, float128 b, float_status *status)
{
    flag aSign, bSign, zSign;
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    bSign = extractFloat128Sign( b );
    zSign = aSign ^ bSign;
    /* `a' is an infinity or a NaN. */
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        if ( bExp == 0x7FFF ) {
            if (bSig0 | bSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            /* infinity / infinity */
            goto invalid;
        }
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }
    /* `b' is an infinity or a NaN while `a' is finite. */
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        return packFloat128( zSign, 0, 0, 0 );
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) {
            /* Division by zero: invalid for 0/0, divbyzero otherwise. */
            if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
 invalid:
                float_raise(float_flag_invalid, status);
                return float128_default_nan(status);
            }
            float_raise(float_flag_divbyzero, status);
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    zExp = aExp - bExp + 0x3FFD;
    /*
     * Set the implicit integer bit (bit 48 of the high word) on both
     * significands and shift them left by 15 so that bit ends up at bit 63.
     */
    shortShift128Left(
        aSig0 | UINT64_C(0x0001000000000000), aSig1, 15, &aSig0, &aSig1 );
    shortShift128Left(
        bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 );
    /*
     * When the divisor significand does not exceed the dividend significand,
     * halve the dividend and compensate in the exponent (presumably so the
     * 64-bit quotient estimate below cannot overflow).
     */
    if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
        shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
        ++zExp;
    }
    /*
     * First quotient word: estimate, compute the remainder, then step the
     * estimate down while the remainder is negative.
     */
    zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
    mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
    sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
    }
    /* Second quotient word, estimated from the remaining remainder. */
    zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
    /*
     * The exact correction is only carried out when the low 14 bits of the
     * estimate are small; presumably the estimate's error cannot affect
     * rounding otherwise.
     */
    if ( ( zSig1 & 0x3FFF ) <= 4 ) {
        mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
        sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
        }
        /* Record a nonzero remainder as a sticky bit. */
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    /* Shift the quotient right by 15, spilling into zSig2, then round. */
    shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}
7523 
/*----------------------------------------------------------------------------
| Returns the remainder of the quadruple-precision floating-point value `a'
| with respect to the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
|
| Special cases handled before the general algorithm:
|   - a NaN operand is propagated through propagateFloat128NaN();
|   - an infinite `a', or a zero `b', raises float_flag_invalid and returns
|     the default NaN;
|   - an infinite `b' (with finite `a') or a zero `a' returns `a' unchanged;
|   - `a' is also returned unchanged when its exponent is more than one below
|     the exponent of `b'.
*----------------------------------------------------------------------------*/

float128 float128_rem(float128 a, float128 b, float_status *status)
{
    flag aSign, zSign;
    int32_t aExp, bExp, expDiff;
    uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
    uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
    int64_t sigMean0;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    /* `a' is an infinity or a NaN. */
    if ( aExp == 0x7FFF ) {
        if (    ( aSig0 | aSig1 )
             || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
            return propagateFloat128NaN(a, b, status);
        }
        /* remainder of an infinity */
        goto invalid;
    }
    /* `b' is an infinity or a NaN while `a' is finite. */
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        return a;
    }
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) {
            /* remainder with respect to zero */
 invalid:
            float_raise(float_flag_invalid, status);
            return float128_default_nan(status);
        }
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return a;
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    expDiff = aExp - bExp;
    if ( expDiff < -1 ) return a;
    /*
     * Set the implicit integer bit on both significands and shift them up;
     * `a' is shifted one bit less when its exponent is one below b's.
     */
    shortShift128Left(
        aSig0 | UINT64_C(0x0001000000000000),
        aSig1,
        15 - ( expDiff < 0 ),
        &aSig0,
        &aSig1
    );
    shortShift128Left(
        bSig0 | UINT64_C(0x0001000000000000), bSig1, 15, &bSig0, &bSig1 );
    /* Leading quotient bit: subtract b once if it fits. */
    q = le128( bSig0, bSig1, aSig0, aSig1 );
    if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    expDiff -= 64;
    /*
     * Reduce the exponent difference by 61 per iteration.  The quotient
     * estimate is lowered by 4 (clamped at 0), presumably so that it never
     * exceeds the true quotient.
     */
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        q = ( 4 < q ) ? q - 4 : 0;
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
        shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
        sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
        expDiff -= 61;
    }
    if ( -64 < expDiff ) {
        /* Final partial step: keep only the quotient bits still needed. */
        q = estimateDiv128To64( aSig0, aSig1, bSig0 );
        q = ( 4 < q ) ? q - 4 : 0;
        q >>= - expDiff;
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
        expDiff += 52;
        if ( expDiff < 0 ) {
            shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
        }
        else {
            shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
        }
        mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
        sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
    }
    else {
        shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
        shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
    }
    /*
     * Keep subtracting b until the remainder turns negative, remembering the
     * previous remainder in alternateASig and counting each step in q.
     */
    do {
        alternateASig0 = aSig0;
        alternateASig1 = aSig1;
        ++q;
        sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
    } while ( 0 <= (int64_t) aSig0 );
    /*
     * Choose between the negative remainder and the previous one by the sign
     * of their sum; when the sum is zero (a tie) the low bit of q decides.
     */
    add128(
        aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
    if (    ( sigMean0 < 0 )
         || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
        aSig0 = alternateASig0;
        aSig1 = alternateASig1;
    }
    /* Convert a negative remainder to sign-and-magnitude form. */
    zSign = ( (int64_t) aSig0 < 0 );
    if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
    return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
                                         status);
}
7630 
/*----------------------------------------------------------------------------
| Returns the square root of the quadruple-precision floating-point value `a'.
| The operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
|
| Special cases handled before the general algorithm:
|   - a NaN is propagated through propagateFloat128NaN();
|   - positive infinity and negative zero are returned unchanged;
|   - positive zero returns positive zero;
|   - any other negative input (including negative infinity) raises
|     float_flag_invalid and returns the default NaN.
*----------------------------------------------------------------------------*/

float128 float128_sqrt(float128 a, float_status *status)
{
    flag aSign;
    int32_t aExp, zExp;
    uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    /* `a' is an infinity or a NaN. */
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, a, status);
        }
        if ( ! aSign ) return a;
        goto invalid;
    }
    if ( aSign ) {
        /* Negative zero is returned as is; other negatives are invalid. */
        if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
 invalid:
        float_raise(float_flag_invalid, status);
        return float128_default_nan(status);
    }
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    /* Halve the unbiased exponent and re-bias it. */
    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
    aSig0 |= UINT64_C(0x0001000000000000);
    /* Initial estimate of the root, then refine it to a 64-bit value. */
    zSig0 = estimateSqrt32( aExp, aSig0>>17 );
    /* The shift count depends on the parity of the exponent. */
    shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
    doubleZSig0 = zSig0<<1;
    /* Remainder a - zSig0^2; step zSig0 down while it is negative. */
    mul64To128( zSig0, zSig0, &term0, &term1 );
    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        doubleZSig0 -= 2;
        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
    }
    /* Second result word, estimated from the remainder. */
    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
    /*
     * The exact correction is only carried out when the low 13 bits of the
     * estimate are small; presumably the estimate's error cannot affect
     * rounding otherwise.
     */
    if ( ( zSig1 & 0x1FFF ) <= 5 ) {
        if ( zSig1 == 0 ) zSig1 = 1;
        mul64To128( doubleZSig0, zSig1, &term1, &term2 );
        sub128( rem1, 0, term1, term2, &rem1, &rem2 );
        mul64To128( zSig1, zSig1, &term2, &term3 );
        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            shortShift128Left( 0, zSig1, 1, &term2, &term3 );
            term3 |= 1;
            term2 |= doubleZSig0;
            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
        }
        /* Record a nonzero remainder as a sticky bit. */
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    /* Shift the result right by 14, spilling into zSig2, then round. */
    shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
    return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);

}
7698 
7699 /*----------------------------------------------------------------------------
7700 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
7701 | the corresponding value `b', and 0 otherwise.  The invalid exception is
7702 | raised if either operand is a NaN.  Otherwise, the comparison is performed
7703 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7704 *----------------------------------------------------------------------------*/
7705 
7706 int float128_eq(float128 a, float128 b, float_status *status)
7707 {
7708 
7709     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7710               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7711          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7712               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7713        ) {
7714         float_raise(float_flag_invalid, status);
7715         return 0;
7716     }
7717     return
7718            ( a.low == b.low )
7719         && (    ( a.high == b.high )
7720              || (    ( a.low == 0 )
7721                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
7722            );
7723 
7724 }
7725 
7726 /*----------------------------------------------------------------------------
7727 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7728 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
7729 | exception is raised if either operand is a NaN.  The comparison is performed
7730 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7731 *----------------------------------------------------------------------------*/
7732 
7733 int float128_le(float128 a, float128 b, float_status *status)
7734 {
7735     flag aSign, bSign;
7736 
7737     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7738               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7739          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7740               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7741        ) {
7742         float_raise(float_flag_invalid, status);
7743         return 0;
7744     }
7745     aSign = extractFloat128Sign( a );
7746     bSign = extractFloat128Sign( b );
7747     if ( aSign != bSign ) {
7748         return
7749                aSign
7750             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7751                  == 0 );
7752     }
7753     return
7754           aSign ? le128( b.high, b.low, a.high, a.low )
7755         : le128( a.high, a.low, b.high, b.low );
7756 
7757 }
7758 
7759 /*----------------------------------------------------------------------------
7760 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7761 | the corresponding value `b', and 0 otherwise.  The invalid exception is
7762 | raised if either operand is a NaN.  The comparison is performed according
7763 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7764 *----------------------------------------------------------------------------*/
7765 
7766 int float128_lt(float128 a, float128 b, float_status *status)
7767 {
7768     flag aSign, bSign;
7769 
7770     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7771               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7772          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7773               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7774        ) {
7775         float_raise(float_flag_invalid, status);
7776         return 0;
7777     }
7778     aSign = extractFloat128Sign( a );
7779     bSign = extractFloat128Sign( b );
7780     if ( aSign != bSign ) {
7781         return
7782                aSign
7783             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7784                  != 0 );
7785     }
7786     return
7787           aSign ? lt128( b.high, b.low, a.high, a.low )
7788         : lt128( a.high, a.low, b.high, b.low );
7789 
7790 }
7791 
7792 /*----------------------------------------------------------------------------
7793 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7794 | be compared, and 0 otherwise.  The invalid exception is raised if either
7795 | operand is a NaN. The comparison is performed according to the IEC/IEEE
7796 | Standard for Binary Floating-Point Arithmetic.
7797 *----------------------------------------------------------------------------*/
7798 
7799 int float128_unordered(float128 a, float128 b, float_status *status)
7800 {
7801     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7802               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7803          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7804               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7805        ) {
7806         float_raise(float_flag_invalid, status);
7807         return 1;
7808     }
7809     return 0;
7810 }
7811 
7812 /*----------------------------------------------------------------------------
7813 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
7814 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
7815 | exception.  The comparison is performed according to the IEC/IEEE Standard
7816 | for Binary Floating-Point Arithmetic.
7817 *----------------------------------------------------------------------------*/
7818 
7819 int float128_eq_quiet(float128 a, float128 b, float_status *status)
7820 {
7821 
7822     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7823               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7824          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7825               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7826        ) {
7827         if (float128_is_signaling_nan(a, status)
7828          || float128_is_signaling_nan(b, status)) {
7829             float_raise(float_flag_invalid, status);
7830         }
7831         return 0;
7832     }
7833     return
7834            ( a.low == b.low )
7835         && (    ( a.high == b.high )
7836              || (    ( a.low == 0 )
7837                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
7838            );
7839 
7840 }
7841 
7842 /*----------------------------------------------------------------------------
7843 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7844 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
7845 | cause an exception.  Otherwise, the comparison is performed according to the
7846 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7847 *----------------------------------------------------------------------------*/
7848 
7849 int float128_le_quiet(float128 a, float128 b, float_status *status)
7850 {
7851     flag aSign, bSign;
7852 
7853     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7854               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7855          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7856               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7857        ) {
7858         if (float128_is_signaling_nan(a, status)
7859          || float128_is_signaling_nan(b, status)) {
7860             float_raise(float_flag_invalid, status);
7861         }
7862         return 0;
7863     }
7864     aSign = extractFloat128Sign( a );
7865     bSign = extractFloat128Sign( b );
7866     if ( aSign != bSign ) {
7867         return
7868                aSign
7869             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7870                  == 0 );
7871     }
7872     return
7873           aSign ? le128( b.high, b.low, a.high, a.low )
7874         : le128( a.high, a.low, b.high, b.low );
7875 
7876 }
7877 
7878 /*----------------------------------------------------------------------------
7879 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7880 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
7881 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
7882 | Standard for Binary Floating-Point Arithmetic.
7883 *----------------------------------------------------------------------------*/
7884 
7885 int float128_lt_quiet(float128 a, float128 b, float_status *status)
7886 {
7887     flag aSign, bSign;
7888 
7889     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7890               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7891          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7892               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7893        ) {
7894         if (float128_is_signaling_nan(a, status)
7895          || float128_is_signaling_nan(b, status)) {
7896             float_raise(float_flag_invalid, status);
7897         }
7898         return 0;
7899     }
7900     aSign = extractFloat128Sign( a );
7901     bSign = extractFloat128Sign( b );
7902     if ( aSign != bSign ) {
7903         return
7904                aSign
7905             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7906                  != 0 );
7907     }
7908     return
7909           aSign ? lt128( b.high, b.low, a.high, a.low )
7910         : lt128( a.high, a.low, b.high, b.low );
7911 
7912 }
7913 
7914 /*----------------------------------------------------------------------------
7915 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7916 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
7917 | comparison is performed according to the IEC/IEEE Standard for Binary
7918 | Floating-Point Arithmetic.
7919 *----------------------------------------------------------------------------*/
7920 
7921 int float128_unordered_quiet(float128 a, float128 b, float_status *status)
7922 {
7923     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7924               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7925          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7926               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7927        ) {
7928         if (float128_is_signaling_nan(a, status)
7929          || float128_is_signaling_nan(b, status)) {
7930             float_raise(float_flag_invalid, status);
7931         }
7932         return 1;
7933     }
7934     return 0;
7935 }
7936 
7937 static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
7938                                             int is_quiet, float_status *status)
7939 {
7940     flag aSign, bSign;
7941 
7942     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
7943         float_raise(float_flag_invalid, status);
7944         return float_relation_unordered;
7945     }
7946     if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
7947           ( extractFloatx80Frac( a )<<1 ) ) ||
7948         ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
7949           ( extractFloatx80Frac( b )<<1 ) )) {
7950         if (!is_quiet ||
7951             floatx80_is_signaling_nan(a, status) ||
7952             floatx80_is_signaling_nan(b, status)) {
7953             float_raise(float_flag_invalid, status);
7954         }
7955         return float_relation_unordered;
7956     }
7957     aSign = extractFloatx80Sign( a );
7958     bSign = extractFloatx80Sign( b );
7959     if ( aSign != bSign ) {
7960 
7961         if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
7962              ( ( a.low | b.low ) == 0 ) ) {
7963             /* zero case */
7964             return float_relation_equal;
7965         } else {
7966             return 1 - (2 * aSign);
7967         }
7968     } else {
7969         /* Normalize pseudo-denormals before comparison.  */
7970         if ((a.high & 0x7fff) == 0 && a.low & UINT64_C(0x8000000000000000)) {
7971             ++a.high;
7972         }
7973         if ((b.high & 0x7fff) == 0 && b.low & UINT64_C(0x8000000000000000)) {
7974             ++b.high;
7975         }
7976         if (a.low == b.low && a.high == b.high) {
7977             return float_relation_equal;
7978         } else {
7979             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7980         }
7981     }
7982 }
7983 
7984 int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
7985 {
7986     return floatx80_compare_internal(a, b, 0, status);
7987 }
7988 
7989 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
7990 {
7991     return floatx80_compare_internal(a, b, 1, status);
7992 }
7993 
7994 static inline int float128_compare_internal(float128 a, float128 b,
7995                                             int is_quiet, float_status *status)
7996 {
7997     flag aSign, bSign;
7998 
7999     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
8000           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
8001         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
8002           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
8003         if (!is_quiet ||
8004             float128_is_signaling_nan(a, status) ||
8005             float128_is_signaling_nan(b, status)) {
8006             float_raise(float_flag_invalid, status);
8007         }
8008         return float_relation_unordered;
8009     }
8010     aSign = extractFloat128Sign( a );
8011     bSign = extractFloat128Sign( b );
8012     if ( aSign != bSign ) {
8013         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
8014             /* zero case */
8015             return float_relation_equal;
8016         } else {
8017             return 1 - (2 * aSign);
8018         }
8019     } else {
8020         if (a.low == b.low && a.high == b.high) {
8021             return float_relation_equal;
8022         } else {
8023             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
8024         }
8025     }
8026 }
8027 
8028 int float128_compare(float128 a, float128 b, float_status *status)
8029 {
8030     return float128_compare_internal(a, b, 0, status);
8031 }
8032 
8033 int float128_compare_quiet(float128 a, float128 b, float_status *status)
8034 {
8035     return float128_compare_internal(a, b, 1, status);
8036 }
8037 
8038 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
8039 {
8040     flag aSign;
8041     int32_t aExp;
8042     uint64_t aSig;
8043 
8044     if (floatx80_invalid_encoding(a)) {
8045         float_raise(float_flag_invalid, status);
8046         return floatx80_default_nan(status);
8047     }
8048     aSig = extractFloatx80Frac( a );
8049     aExp = extractFloatx80Exp( a );
8050     aSign = extractFloatx80Sign( a );
8051 
8052     if ( aExp == 0x7FFF ) {
8053         if ( aSig<<1 ) {
8054             return propagateFloatx80NaN(a, a, status);
8055         }
8056         return a;
8057     }
8058 
8059     if (aExp == 0) {
8060         if (aSig == 0) {
8061             return a;
8062         }
8063         aExp++;
8064     }
8065 
8066     if (n > 0x10000) {
8067         n = 0x10000;
8068     } else if (n < -0x10000) {
8069         n = -0x10000;
8070     }
8071 
8072     aExp += n;
8073     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
8074                                          aSign, aExp, aSig, 0, status);
8075 }
8076 
8077 float128 float128_scalbn(float128 a, int n, float_status *status)
8078 {
8079     flag aSign;
8080     int32_t aExp;
8081     uint64_t aSig0, aSig1;
8082 
8083     aSig1 = extractFloat128Frac1( a );
8084     aSig0 = extractFloat128Frac0( a );
8085     aExp = extractFloat128Exp( a );
8086     aSign = extractFloat128Sign( a );
8087     if ( aExp == 0x7FFF ) {
8088         if ( aSig0 | aSig1 ) {
8089             return propagateFloat128NaN(a, a, status);
8090         }
8091         return a;
8092     }
8093     if (aExp != 0) {
8094         aSig0 |= UINT64_C(0x0001000000000000);
8095     } else if (aSig0 == 0 && aSig1 == 0) {
8096         return a;
8097     } else {
8098         aExp++;
8099     }
8100 
8101     if (n > 0x10000) {
8102         n = 0x10000;
8103     } else if (n < -0x10000) {
8104         n = -0x10000;
8105     }
8106 
8107     aExp += n - 1;
8108     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
8109                                          , status);
8110 
8111 }
8112 
8113 static void __attribute__((constructor)) softfloat_init(void)
8114 {
8115     union_float64 ua, ub, uc, ur;
8116 
8117     if (QEMU_NO_HARDFLOAT) {
8118         return;
8119     }
8120     /*
8121      * Test that the host's FMA is not obviously broken. For example,
8122      * glibc < 2.23 can perform an incorrect FMA on certain hosts; see
8123      *   https://sourceware.org/bugzilla/show_bug.cgi?id=13304
8124      */
8125     ua.s = 0x0020000000000001ULL;
8126     ub.s = 0x3ca0000000000000ULL;
8127     uc.s = 0x0020000000000000ULL;
8128     ur.h = fma(ua.h, ub.h, uc.h);
8129     if (ur.s != 0x0020000000000001ULL) {
8130         force_soft_fma = true;
8131     }
8132 }
8133