xref: /openbmc/qemu/fpu/softfloat.c (revision b33b890c)
1 /*
2  * QEMU float support
3  *
4  * The code in this source file is derived from release 2a of the SoftFloat
5  * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6  * some later contributions) are provided under that license, as detailed below.
7  * It has subsequently been modified by contributors to the QEMU Project,
8  * so some portions are provided under:
9  *  the SoftFloat-2a license
10  *  the BSD license
11  *  GPL-v2-or-later
12  *
13  * Any future contributions to this file after December 1st 2014 will be
14  * taken to be licensed under the Softfloat-2a license unless specifically
15  * indicated otherwise.
16  */
17 
18 /*
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
22 
23 Written by John R. Hauser.  This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704.  Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980.  The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
32 
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38 
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
43 
44 ===============================================================================
45 */
46 
47 /* BSD licensing:
48  * Copyright (c) 2006, Fabrice Bellard
49  * All rights reserved.
50  *
51  * Redistribution and use in source and binary forms, with or without
52  * modification, are permitted provided that the following conditions are met:
53  *
54  * 1. Redistributions of source code must retain the above copyright notice,
55  * this list of conditions and the following disclaimer.
56  *
57  * 2. Redistributions in binary form must reproduce the above copyright notice,
58  * this list of conditions and the following disclaimer in the documentation
59  * and/or other materials provided with the distribution.
60  *
61  * 3. Neither the name of the copyright holder nor the names of its contributors
62  * may be used to endorse or promote products derived from this software without
63  * specific prior written permission.
64  *
65  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75  * THE POSSIBILITY OF SUCH DAMAGE.
76  */
77 
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79  * version 2 or later. See the COPYING file in the top-level directory.
80  */
81 
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83  * target-dependent and needs the TARGET_* macros.
84  */
85 #include "qemu/osdep.h"
86 #include <math.h>
87 #include "qemu/bitops.h"
88 #include "fpu/softfloat.h"
89 
90 /* We only need stdlib for abort() */
91 
92 /*----------------------------------------------------------------------------
93 | Primitive arithmetic functions, including multi-word arithmetic, and
94 | division and square root approximations.  (Can be specialized to target if
95 | desired.)
96 *----------------------------------------------------------------------------*/
97 #include "fpu/softfloat-macros.h"
98 
99 /*
100  * Hardfloat
101  *
102  * Fast emulation of guest FP instructions is challenging for two reasons.
103  * First, FP instruction semantics are similar but not identical, particularly
104  * when handling NaNs. Second, emulating at reasonable speed the guest FP
105  * exception flags is not trivial: reading the host's flags register with a
106  * feclearexcept & fetestexcept pair is slow [slightly slower than soft-fp],
 * and trapping on every FP exception is neither fast nor pleasant to work with.
108  *
109  * We address these challenges by leveraging the host FPU for a subset of the
110  * operations. To do this we expand on the idea presented in this paper:
111  *
112  * Guo, Yu-Chuan, et al. "Translating the ARM Neon and VFP instructions in a
113  * binary translator." Software: Practice and Experience 46.12 (2016):1591-1615.
114  *
115  * The idea is thus to leverage the host FPU to (1) compute FP operations
116  * and (2) identify whether FP exceptions occurred while avoiding
117  * expensive exception flag register accesses.
118  *
119  * An important optimization shown in the paper is that given that exception
120  * flags are rarely cleared by the guest, we can avoid recomputing some flags.
121  * This is particularly useful for the inexact flag, which is very frequently
122  * raised in floating-point workloads.
123  *
124  * We optimize the code further by deferring to soft-fp whenever FP exception
125  * detection might get hairy. Two examples: (1) when at least one operand is
126  * denormal/inf/NaN; (2) when operands are not guaranteed to lead to a 0 result
127  * and the result is < the minimum normal.
128  */
129 #define GEN_INPUT_FLUSH__NOCHECK(name, soft_t)                          \
130     static inline void name(soft_t *a, float_status *s)                 \
131     {                                                                   \
132         if (unlikely(soft_t ## _is_denormal(*a))) {                     \
133             *a = soft_t ## _set_sign(soft_t ## _zero,                   \
134                                      soft_t ## _is_neg(*a));            \
135             s->float_exception_flags |= float_flag_input_denormal;      \
136         }                                                               \
137     }
138 
139 GEN_INPUT_FLUSH__NOCHECK(float32_input_flush__nocheck, float32)
140 GEN_INPUT_FLUSH__NOCHECK(float64_input_flush__nocheck, float64)
141 #undef GEN_INPUT_FLUSH__NOCHECK
142 
143 #define GEN_INPUT_FLUSH1(name, soft_t)                  \
144     static inline void name(soft_t *a, float_status *s) \
145     {                                                   \
146         if (likely(!s->flush_inputs_to_zero)) {         \
147             return;                                     \
148         }                                               \
149         soft_t ## _input_flush__nocheck(a, s);          \
150     }
151 
152 GEN_INPUT_FLUSH1(float32_input_flush1, float32)
153 GEN_INPUT_FLUSH1(float64_input_flush1, float64)
154 #undef GEN_INPUT_FLUSH1
155 
156 #define GEN_INPUT_FLUSH2(name, soft_t)                                  \
157     static inline void name(soft_t *a, soft_t *b, float_status *s)      \
158     {                                                                   \
159         if (likely(!s->flush_inputs_to_zero)) {                         \
160             return;                                                     \
161         }                                                               \
162         soft_t ## _input_flush__nocheck(a, s);                          \
163         soft_t ## _input_flush__nocheck(b, s);                          \
164     }
165 
166 GEN_INPUT_FLUSH2(float32_input_flush2, float32)
167 GEN_INPUT_FLUSH2(float64_input_flush2, float64)
168 #undef GEN_INPUT_FLUSH2
169 
170 #define GEN_INPUT_FLUSH3(name, soft_t)                                  \
171     static inline void name(soft_t *a, soft_t *b, soft_t *c, float_status *s) \
172     {                                                                   \
173         if (likely(!s->flush_inputs_to_zero)) {                         \
174             return;                                                     \
175         }                                                               \
176         soft_t ## _input_flush__nocheck(a, s);                          \
177         soft_t ## _input_flush__nocheck(b, s);                          \
178         soft_t ## _input_flush__nocheck(c, s);                          \
179     }
180 
181 GEN_INPUT_FLUSH3(float32_input_flush3, float32)
182 GEN_INPUT_FLUSH3(float64_input_flush3, float64)
183 #undef GEN_INPUT_FLUSH3
184 
185 /*
186  * Choose whether to use fpclassify or float32/64_* primitives in the generated
187  * hardfloat functions. Each combination of number of inputs and float size
188  * gets its own value.
189  */
/* The float32 flavors use the softfloat primitives on every host. */
#define QEMU_HARDFLOAT_1F32_USE_FP 0
#define QEMU_HARDFLOAT_2F32_USE_FP 0
#define QEMU_HARDFLOAT_3F32_USE_FP 0

/* The float64 flavors switch to fpclassify on x86_64 hosts only. */
#if !defined(__x86_64__)
# define QEMU_HARDFLOAT_1F64_USE_FP 0
# define QEMU_HARDFLOAT_2F64_USE_FP 0
# define QEMU_HARDFLOAT_3F64_USE_FP 0
#else
# define QEMU_HARDFLOAT_1F64_USE_FP 1
# define QEMU_HARDFLOAT_2F64_USE_FP 1
# define QEMU_HARDFLOAT_3F64_USE_FP 1
#endif
205 
206 /*
207  * QEMU_HARDFLOAT_USE_ISINF chooses whether to use isinf() over
208  * float{32,64}_is_infinity when !USE_FP.
209  * On x86_64/aarch64, using the former over the latter can yield a ~6% speedup.
210  * On power64 however, using isinf() reduces fp-bench performance by up to 50%.
211  */
/* isinf() is selected on x86_64 and aarch64 hosts only. */
#if !defined(__x86_64__) && !defined(__aarch64__)
# define QEMU_HARDFLOAT_USE_ISINF   0
#else
# define QEMU_HARDFLOAT_USE_ISINF   1
#endif
217 
218 /*
219  * Some targets clear the FP flags before most FP operations. This prevents
220  * the use of hardfloat, since hardfloat relies on the inexact flag being
221  * already set.
222  */
/* Tell the builder why hardfloat is being switched off under -ffast-math. */
#if defined(__FAST_MATH__)
# warning disabling hardfloat due to -ffast-math: hardfloat requires an exact \
    IEEE implementation
#endif

/*
 * Hardfloat stays enabled only when neither TARGET_PPC nor __FAST_MATH__
 * is defined; in that configuration the softfloat entry points are also
 * marked noinline.
 */
#if !defined(TARGET_PPC) && !defined(__FAST_MATH__)
# define QEMU_NO_HARDFLOAT 0
# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN __attribute__((noinline))
#else
# define QEMU_NO_HARDFLOAT 1
# define QEMU_SOFTFLOAT_ATTR QEMU_FLATTEN
#endif
234 
235 static inline bool can_use_fpu(const float_status *s)
236 {
237     if (QEMU_NO_HARDFLOAT) {
238         return false;
239     }
240     return likely(s->float_exception_flags & float_flag_inexact &&
241                   s->float_rounding_mode == float_round_nearest_even);
242 }
243 
244 /*
245  * Hardfloat generation functions. Each operation can have two flavors:
246  * either using softfloat primitives (e.g. float32_is_zero_or_normal) for
247  * most condition checks, or native ones (e.g. fpclassify).
248  *
249  * The flavor is chosen by the callers. Instead of using macros, we rely on the
250  * compiler to propagate constants and inline everything into the callers.
251  *
252  * We only generate functions for operations with two inputs, since only
253  * these are common enough to justify consolidating them into common code.
254  */
255 
/*
 * A single-precision value viewed either as the softfloat type (.s) or as
 * the host's native float (.h).  The hardfloat helpers store through one
 * member and read through the other.
 */
typedef union {
    float32 s;  /* softfloat representation */
    float h;    /* host representation */
} union_float32;
260 
/*
 * A double-precision value viewed either as the softfloat type (.s) or as
 * the host's native double (.h).  The hardfloat helpers store through one
 * member and read through the other.
 */
typedef union {
    float64 s;  /* softfloat representation */
    double h;   /* host representation */
} union_float64;
265 
/* Predicates over a pair of operands, used as the pre/post/fast_test hooks. */
typedef bool (*f32_check_fn)(union_float32 a, union_float32 b);
typedef bool (*f64_check_fn)(union_float64 a, union_float64 b);

/* Two-operand operation on softfloat types; receives the float_status. */
typedef float32 (*soft_f32_op2_fn)(float32 a, float32 b, float_status *s);
typedef float64 (*soft_f64_op2_fn)(float64 a, float64 b, float_status *s);
/* Two-operand operation on host types; takes no float_status. */
typedef float   (*hard_f32_op2_fn)(float a, float b);
typedef double  (*hard_f64_op2_fn)(double a, double b);
273 
274 /* 2-input is-zero-or-normal */
275 static inline bool f32_is_zon2(union_float32 a, union_float32 b)
276 {
277     if (QEMU_HARDFLOAT_2F32_USE_FP) {
278         /*
279          * Not using a temp variable for consecutive fpclassify calls ends up
280          * generating faster code.
281          */
282         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
283                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
284     }
285     return float32_is_zero_or_normal(a.s) &&
286            float32_is_zero_or_normal(b.s);
287 }
288 
289 static inline bool f64_is_zon2(union_float64 a, union_float64 b)
290 {
291     if (QEMU_HARDFLOAT_2F64_USE_FP) {
292         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
293                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO);
294     }
295     return float64_is_zero_or_normal(a.s) &&
296            float64_is_zero_or_normal(b.s);
297 }
298 
299 /* 3-input is-zero-or-normal */
300 static inline
301 bool f32_is_zon3(union_float32 a, union_float32 b, union_float32 c)
302 {
303     if (QEMU_HARDFLOAT_3F32_USE_FP) {
304         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
305                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
306                (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
307     }
308     return float32_is_zero_or_normal(a.s) &&
309            float32_is_zero_or_normal(b.s) &&
310            float32_is_zero_or_normal(c.s);
311 }
312 
313 static inline
314 bool f64_is_zon3(union_float64 a, union_float64 b, union_float64 c)
315 {
316     if (QEMU_HARDFLOAT_3F64_USE_FP) {
317         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
318                (fpclassify(b.h) == FP_NORMAL || fpclassify(b.h) == FP_ZERO) &&
319                (fpclassify(c.h) == FP_NORMAL || fpclassify(c.h) == FP_ZERO);
320     }
321     return float64_is_zero_or_normal(a.s) &&
322            float64_is_zero_or_normal(b.s) &&
323            float64_is_zero_or_normal(c.s);
324 }
325 
326 static inline bool f32_is_inf(union_float32 a)
327 {
328     if (QEMU_HARDFLOAT_USE_ISINF) {
329         return isinf(a.h);
330     }
331     return float32_is_infinity(a.s);
332 }
333 
334 static inline bool f64_is_inf(union_float64 a)
335 {
336     if (QEMU_HARDFLOAT_USE_ISINF) {
337         return isinf(a.h);
338     }
339     return float64_is_infinity(a.s);
340 }
341 
342 /* Note: @fast_test and @post can be NULL */
343 static inline float32
344 float32_gen2(float32 xa, float32 xb, float_status *s,
345              hard_f32_op2_fn hard, soft_f32_op2_fn soft,
346              f32_check_fn pre, f32_check_fn post,
347              f32_check_fn fast_test, soft_f32_op2_fn fast_op)
348 {
349     union_float32 ua, ub, ur;
350 
351     ua.s = xa;
352     ub.s = xb;
353 
354     if (unlikely(!can_use_fpu(s))) {
355         goto soft;
356     }
357 
358     float32_input_flush2(&ua.s, &ub.s, s);
359     if (unlikely(!pre(ua, ub))) {
360         goto soft;
361     }
362     if (fast_test && fast_test(ua, ub)) {
363         return fast_op(ua.s, ub.s, s);
364     }
365 
366     ur.h = hard(ua.h, ub.h);
367     if (unlikely(f32_is_inf(ur))) {
368         s->float_exception_flags |= float_flag_overflow;
369     } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) {
370         if (post == NULL || post(ua, ub)) {
371             goto soft;
372         }
373     }
374     return ur.s;
375 
376  soft:
377     return soft(ua.s, ub.s, s);
378 }
379 
380 static inline float64
381 float64_gen2(float64 xa, float64 xb, float_status *s,
382              hard_f64_op2_fn hard, soft_f64_op2_fn soft,
383              f64_check_fn pre, f64_check_fn post,
384              f64_check_fn fast_test, soft_f64_op2_fn fast_op)
385 {
386     union_float64 ua, ub, ur;
387 
388     ua.s = xa;
389     ub.s = xb;
390 
391     if (unlikely(!can_use_fpu(s))) {
392         goto soft;
393     }
394 
395     float64_input_flush2(&ua.s, &ub.s, s);
396     if (unlikely(!pre(ua, ub))) {
397         goto soft;
398     }
399     if (fast_test && fast_test(ua, ub)) {
400         return fast_op(ua.s, ub.s, s);
401     }
402 
403     ur.h = hard(ua.h, ub.h);
404     if (unlikely(f64_is_inf(ur))) {
405         s->float_exception_flags |= float_flag_overflow;
406     } else if (unlikely(fabs(ur.h) <= DBL_MIN)) {
407         if (post == NULL || post(ua, ub)) {
408             goto soft;
409         }
410     }
411     return ur.s;
412 
413  soft:
414     return soft(ua.s, ub.s, s);
415 }
416 
417 /*----------------------------------------------------------------------------
418 | Returns the fraction bits of the half-precision floating-point value `a'.
419 *----------------------------------------------------------------------------*/
420 
421 static inline uint32_t extractFloat16Frac(float16 a)
422 {
423     return float16_val(a) & 0x3ff;
424 }
425 
426 /*----------------------------------------------------------------------------
427 | Returns the exponent bits of the half-precision floating-point value `a'.
428 *----------------------------------------------------------------------------*/
429 
430 static inline int extractFloat16Exp(float16 a)
431 {
432     return (float16_val(a) >> 10) & 0x1f;
433 }
434 
435 /*----------------------------------------------------------------------------
436 | Returns the fraction bits of the single-precision floating-point value `a'.
437 *----------------------------------------------------------------------------*/
438 
439 static inline uint32_t extractFloat32Frac(float32 a)
440 {
441     return float32_val(a) & 0x007FFFFF;
442 }
443 
444 /*----------------------------------------------------------------------------
445 | Returns the exponent bits of the single-precision floating-point value `a'.
446 *----------------------------------------------------------------------------*/
447 
448 static inline int extractFloat32Exp(float32 a)
449 {
450     return (float32_val(a) >> 23) & 0xFF;
451 }
452 
453 /*----------------------------------------------------------------------------
454 | Returns the sign bit of the single-precision floating-point value `a'.
455 *----------------------------------------------------------------------------*/
456 
457 static inline flag extractFloat32Sign(float32 a)
458 {
459     return float32_val(a) >> 31;
460 }
461 
462 /*----------------------------------------------------------------------------
463 | Returns the fraction bits of the double-precision floating-point value `a'.
464 *----------------------------------------------------------------------------*/
465 
466 static inline uint64_t extractFloat64Frac(float64 a)
467 {
468     return float64_val(a) & LIT64(0x000FFFFFFFFFFFFF);
469 }
470 
471 /*----------------------------------------------------------------------------
472 | Returns the exponent bits of the double-precision floating-point value `a'.
473 *----------------------------------------------------------------------------*/
474 
475 static inline int extractFloat64Exp(float64 a)
476 {
477     return (float64_val(a) >> 52) & 0x7FF;
478 }
479 
480 /*----------------------------------------------------------------------------
481 | Returns the sign bit of the double-precision floating-point value `a'.
482 *----------------------------------------------------------------------------*/
483 
484 static inline flag extractFloat64Sign(float64 a)
485 {
486     return float64_val(a) >> 63;
487 }
488 
489 /*
490  * Classify a floating point number. Everything above float_class_qnan
491  * is a NaN so cls >= float_class_qnan is any NaN.
492  */
493 
/*
 * The numeric order matters: both NaN classes sort after every other
 * class, so a single ">= float_class_qnan" comparison identifies a NaN.
 */
typedef enum __attribute__ ((__packed__)) {
    float_class_unclassified = 0,
    float_class_zero         = 1,
    float_class_normal       = 2,
    float_class_inf          = 3,
    float_class_qnan         = 4,  /* all NaNs from here */
    float_class_snan         = 5,
} FloatClass;
502 
503 /* Simple helpers for checking if, or what kind of, NaN we have */
504 static inline __attribute__((unused)) bool is_nan(FloatClass c)
505 {
506     return unlikely(c >= float_class_qnan);
507 }
508 
509 static inline __attribute__((unused)) bool is_snan(FloatClass c)
510 {
511     return c == float_class_snan;
512 }
513 
514 static inline __attribute__((unused)) bool is_qnan(FloatClass c)
515 {
516     return c == float_class_qnan;
517 }
518 
519 /*
520  * Structure holding all of the decomposed parts of a float. The
521  * exponent is unbiased and the fraction is normalized. All
522  * calculations are done with a 64 bit fraction and then rounded as
523  * appropriate for the final format.
524  *
525  * Thanks to the packed FloatClass a decent compiler should be able to
526  * fit the whole structure into registers and avoid using the stack
527  * for parameter passing.
528  */
529 
/* Decomposed view of one floating-point value. */
typedef struct {
    uint64_t frac;   /* fraction bits */
    int32_t  exp;    /* exponent */
    FloatClass cls;  /* classification; unpack_raw leaves it unclassified */
    bool sign;       /* sign bit */
} FloatParts;
536 
/*
 * Position of the binary point inside the 64-bit decomposed fraction,
 * the bit holding the implicit leading one, and the bit directly above it.
 */
#define DECOMPOSED_BINARY_POINT    62
#define DECOMPOSED_IMPLICIT_BIT    (1ull << DECOMPOSED_BINARY_POINT)
#define DECOMPOSED_OVERFLOW_BIT    (1ull << (DECOMPOSED_BINARY_POINT + 1))
540 
541 /* Structure holding all of the relevant parameters for a format.
542  *   exp_size: the size of the exponent field
543  *   exp_bias: the offset applied to the exponent field
544  *   exp_max: the maximum normalised exponent
545  *   frac_size: the size of the fraction field
546  *   frac_shift: shift to normalise the fraction with DECOMPOSED_BINARY_POINT
 * The following are computed based on the size of the fraction
548  *   frac_lsb: least significant bit of fraction
549  *   frac_lsbm1: the bit below the least significant bit (for rounding)
550  *   round_mask/roundeven_mask: masks used for rounding
551  * The following optional modifiers are available:
552  *   arm_althp: handle ARM Alternative Half Precision
553  */
/* Per-format parameters; normally filled in through FLOAT_PARAMS(). */
typedef struct {
    int exp_size;            /* width of the exponent field, in bits */
    int exp_bias;            /* offset applied to the exponent field */
    int exp_max;             /* all-ones value of the exponent field */
    int frac_size;           /* width of the fraction field, in bits */
    int frac_shift;          /* DECOMPOSED_BINARY_POINT - frac_size */
    uint64_t frac_lsb;       /* 1 << frac_shift */
    uint64_t frac_lsbm1;     /* the bit just below frac_lsb */
    uint64_t round_mask;     /* every bit below frac_lsb */
    uint64_t roundeven_mask; /* frac_lsb plus every bit below it */
    bool arm_althp;          /* ARM Alternative Half Precision handling */
} FloatFmt;
566 
567 /* Expand fields based on the size of exponent and fraction */
/*
 * Expand to the designated initializers of a FloatFmt for a format with an
 * E-bit exponent field and an F-bit fraction field.
 *
 * E and F are parenthesized at every use so that an expression argument
 * (e.g. FLOAT_PARAMS(8, 20 + 3)) yields the same values as its literal
 * equivalent; without that, "DECOMPOSED_BINARY_POINT - F" would mis-parse.
 */
#define FLOAT_PARAMS(E, F)                                           \
    .exp_size       = (E),                                           \
    .exp_bias       = ((1 << (E)) - 1) >> 1,                         \
    .exp_max        = (1 << (E)) - 1,                                \
    .frac_size      = (F),                                           \
    .frac_shift     = DECOMPOSED_BINARY_POINT - (F),                 \
    .frac_lsb       = 1ull << (DECOMPOSED_BINARY_POINT - (F)),       \
    .frac_lsbm1     = 1ull << ((DECOMPOSED_BINARY_POINT - (F)) - 1), \
    .round_mask     = (1ull << (DECOMPOSED_BINARY_POINT - (F))) - 1, \
    .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - (F))) - 1
578 
/* Half precision: 5 exponent bits, 10 fraction bits. */
static const FloatFmt float16_params = {
    FLOAT_PARAMS(5, 10)
};

/* Same field sizes as float16_params, with arm_althp handling enabled. */
static const FloatFmt float16_params_ahp = {
    FLOAT_PARAMS(5, 10),
    .arm_althp = true
};

/* Single precision: 8 exponent bits, 23 fraction bits. */
static const FloatFmt float32_params = {
    FLOAT_PARAMS(8, 23)
};

/* Double precision: 11 exponent bits, 52 fraction bits. */
static const FloatFmt float64_params = {
    FLOAT_PARAMS(11, 52)
};
595 
596 /* Unpack a float to parts, but do not canonicalize.  */
597 static inline FloatParts unpack_raw(FloatFmt fmt, uint64_t raw)
598 {
599     const int sign_pos = fmt.frac_size + fmt.exp_size;
600 
601     return (FloatParts) {
602         .cls = float_class_unclassified,
603         .sign = extract64(raw, sign_pos, 1),
604         .exp = extract64(raw, fmt.frac_size, fmt.exp_size),
605         .frac = extract64(raw, 0, fmt.frac_size),
606     };
607 }
608 
609 static inline FloatParts float16_unpack_raw(float16 f)
610 {
611     return unpack_raw(float16_params, f);
612 }
613 
614 static inline FloatParts float32_unpack_raw(float32 f)
615 {
616     return unpack_raw(float32_params, f);
617 }
618 
619 static inline FloatParts float64_unpack_raw(float64 f)
620 {
621     return unpack_raw(float64_params, f);
622 }
623 
624 /* Pack a float from parts, but do not canonicalize.  */
625 static inline uint64_t pack_raw(FloatFmt fmt, FloatParts p)
626 {
627     const int sign_pos = fmt.frac_size + fmt.exp_size;
628     uint64_t ret = deposit64(p.frac, fmt.frac_size, fmt.exp_size, p.exp);
629     return deposit64(ret, sign_pos, 1, p.sign);
630 }
631 
632 static inline float16 float16_pack_raw(FloatParts p)
633 {
634     return make_float16(pack_raw(float16_params, p));
635 }
636 
637 static inline float32 float32_pack_raw(FloatParts p)
638 {
639     return make_float32(pack_raw(float32_params, p));
640 }
641 
642 static inline float64 float64_pack_raw(FloatParts p)
643 {
644     return make_float64(pack_raw(float64_params, p));
645 }
646 
647 /*----------------------------------------------------------------------------
648 | Functions and definitions to determine:  (1) whether tininess for underflow
649 | is detected before or after rounding by default, (2) what (if anything)
650 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
651 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
652 | are propagated from function inputs to output.  These details are target-
653 | specific.
654 *----------------------------------------------------------------------------*/
655 #include "softfloat-specialize.h"
656 
657 /* Canonicalize EXP and FRAC, setting CLS.  */
658 static FloatParts sf_canonicalize(FloatParts part, const FloatFmt *parm,
659                                   float_status *status)
660 {
661     if (part.exp == parm->exp_max && !parm->arm_althp) {
662         if (part.frac == 0) {
663             part.cls = float_class_inf;
664         } else {
665             part.frac <<= parm->frac_shift;
666             part.cls = (parts_is_snan_frac(part.frac, status)
667                         ? float_class_snan : float_class_qnan);
668         }
669     } else if (part.exp == 0) {
670         if (likely(part.frac == 0)) {
671             part.cls = float_class_zero;
672         } else if (status->flush_inputs_to_zero) {
673             float_raise(float_flag_input_denormal, status);
674             part.cls = float_class_zero;
675             part.frac = 0;
676         } else {
677             int shift = clz64(part.frac) - 1;
678             part.cls = float_class_normal;
679             part.exp = parm->frac_shift - parm->exp_bias - shift + 1;
680             part.frac <<= shift;
681         }
682     } else {
683         part.cls = float_class_normal;
684         part.exp -= parm->exp_bias;
685         part.frac = DECOMPOSED_IMPLICIT_BIT + (part.frac << parm->frac_shift);
686     }
687     return part;
688 }
689 
690 /* Round and uncanonicalize a floating-point number by parts. There
691  * are FRAC_SHIFT bits that may require rounding at the bottom of the
692  * fraction; these bits will be removed. The exponent will be biased
693  * by EXP_BIAS and must be bounded by [EXP_MAX-1, 0].
694  */
695 
/*
 * @p:    canonical parts to round
 * @s:    supplies the rounding mode, flush_to_zero and tininess-detection
 *        settings; receives the accumulated exception flags via
 *        float_raise()
 * @parm: target format
 *
 * Returns @p with exp and frac rewritten to the values that go into the
 * format's raw fields; cls may be changed to zero or inf along the way.
 */
static FloatParts round_canonical(FloatParts p, float_status *s,
                                  const FloatFmt *parm)
{
    const uint64_t frac_lsb = parm->frac_lsb;
    const uint64_t frac_lsbm1 = parm->frac_lsbm1;
    const uint64_t round_mask = parm->round_mask;
    const uint64_t roundeven_mask = parm->roundeven_mask;
    const int exp_max = parm->exp_max;
    const int frac_shift = parm->frac_shift;
    uint64_t frac, inc;
    int exp, flags = 0;
    bool overflow_norm;

    frac = p.frac;
    exp = p.exp;

    switch (p.cls) {
    case float_class_normal:
        /*
         * Per rounding mode, pick:
         *   inc           - value added to frac before the low bits are
         *                   shifted out
         *   overflow_norm - on overflow, true selects the largest normal
         *                   encoding, false selects infinity
         */
        switch (s->float_rounding_mode) {
        case float_round_nearest_even:
            overflow_norm = false;
            /*
             * No increment only for an exact tie with the lsb clear
             * (lsb == 0, bit below == 1, everything lower == 0).
             */
            inc = ((frac & roundeven_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
            break;
        case float_round_ties_away:
            overflow_norm = false;
            inc = frac_lsbm1;
            break;
        case float_round_to_zero:
            overflow_norm = true;
            inc = 0;
            break;
        case float_round_up:
            inc = p.sign ? 0 : round_mask;
            overflow_norm = p.sign;
            break;
        case float_round_down:
            inc = p.sign ? round_mask : 0;
            overflow_norm = !p.sign;
            break;
        case float_round_to_odd:
            overflow_norm = true;
            /* Increment only when the lsb is currently clear. */
            inc = frac & frac_lsb ? 0 : round_mask;
            break;
        default:
            g_assert_not_reached();
        }

        exp += parm->exp_bias;
        if (likely(exp > 0)) {
            /* Biased exponent is positive: result is a normal number. */
            if (frac & round_mask) {
                flags |= float_flag_inexact;
                frac += inc;
                /* A carry out of the implicit bit bumps the exponent. */
                if (frac & DECOMPOSED_OVERFLOW_BIT) {
                    frac >>= 1;
                    exp++;
                }
            }
            frac >>= frac_shift;

            if (parm->arm_althp) {
                /* ARM Alt HP eschews Inf and NaN for a wider exponent.  */
                if (unlikely(exp > exp_max)) {
                    /* Overflow.  Return the maximum normal.  */
                    /* Note: this replaces, rather than adds to, flags. */
                    flags = float_flag_invalid;
                    exp = exp_max;
                    frac = -1;
                }
            } else if (unlikely(exp >= exp_max)) {
                flags |= float_flag_overflow | float_flag_inexact;
                if (overflow_norm) {
                    exp = exp_max - 1;
                    frac = -1;
                } else {
                    p.cls = float_class_inf;
                    goto do_inf;
                }
            }
        } else if (s->flush_to_zero) {
            /* Would be denormal, but outputs are flushed to zero. */
            flags |= float_flag_output_denormal;
            p.cls = float_class_zero;
            goto do_zero;
        } else {
            /*
             * Denormal result.  It counts as tiny when tininess is detected
             * before rounding, when the biased exponent is negative, or
             * when adding inc does not carry into the overflow bit.
             */
            bool is_tiny = (s->float_detect_tininess
                            == float_tininess_before_rounding)
                        || (exp < 0)
                        || !((frac + inc) & DECOMPOSED_OVERFLOW_BIT);

            shift64RightJamming(frac, 1 - exp, &frac);
            if (frac & round_mask) {
                /* Need to recompute round-to-even.  */
                /*
                 * Only the two modes whose inc depends on frac are redone;
                 * every other mode keeps the inc chosen above.
                 */
                switch (s->float_rounding_mode) {
                case float_round_nearest_even:
                    inc = ((frac & roundeven_mask) != frac_lsbm1
                           ? frac_lsbm1 : 0);
                    break;
                case float_round_to_odd:
                    inc = frac & frac_lsb ? 0 : round_mask;
                    break;
                }
                flags |= float_flag_inexact;
                frac += inc;
            }

            /* Exponent field is 1 if rounding reached the implicit bit. */
            exp = (frac & DECOMPOSED_IMPLICIT_BIT ? 1 : 0);
            frac >>= frac_shift;

            if (is_tiny && (flags & float_flag_inexact)) {
                flags |= float_flag_underflow;
            }
            if (exp == 0 && frac == 0) {
                p.cls = float_class_zero;
            }
        }
        break;

    case float_class_zero:
    do_zero:
        exp = 0;
        frac = 0;
        break;

    case float_class_inf:
    do_inf:
        assert(!parm->arm_althp);
        exp = exp_max;
        frac = 0;
        break;

    case float_class_qnan:
    case float_class_snan:
        assert(!parm->arm_althp);
        exp = exp_max;
        /* Move the NaN fraction back down to field position. */
        frac >>= parm->frac_shift;
        break;

    default:
        g_assert_not_reached();
    }

    float_raise(flags, s);
    p.exp = exp;
    p.frac = frac;
    return p;
}
840 
841 /* Explicit FloatFmt version */
842 static FloatParts float16a_unpack_canonical(float16 f, float_status *s,
843                                             const FloatFmt *params)
844 {
845     return sf_canonicalize(float16_unpack_raw(f), params, s);
846 }
847 
848 static FloatParts float16_unpack_canonical(float16 f, float_status *s)
849 {
850     return float16a_unpack_canonical(f, s, &float16_params);
851 }
852 
853 static float16 float16a_round_pack_canonical(FloatParts p, float_status *s,
854                                              const FloatFmt *params)
855 {
856     return float16_pack_raw(round_canonical(p, s, params));
857 }
858 
859 static float16 float16_round_pack_canonical(FloatParts p, float_status *s)
860 {
861     return float16a_round_pack_canonical(p, s, &float16_params);
862 }
863 
864 static FloatParts float32_unpack_canonical(float32 f, float_status *s)
865 {
866     return sf_canonicalize(float32_unpack_raw(f), &float32_params, s);
867 }
868 
869 static float32 float32_round_pack_canonical(FloatParts p, float_status *s)
870 {
871     return float32_pack_raw(round_canonical(p, s, &float32_params));
872 }
873 
874 static FloatParts float64_unpack_canonical(float64 f, float_status *s)
875 {
876     return sf_canonicalize(float64_unpack_raw(f), &float64_params, s);
877 }
878 
879 static float64 float64_round_pack_canonical(FloatParts p, float_status *s)
880 {
881     return float64_pack_raw(round_canonical(p, s, &float64_params));
882 }
883 
884 static FloatParts return_nan(FloatParts a, float_status *s)
885 {
886     switch (a.cls) {
887     case float_class_snan:
888         s->float_exception_flags |= float_flag_invalid;
889         a = parts_silence_nan(a, s);
890         /* fall through */
891     case float_class_qnan:
892         if (s->default_nan_mode) {
893             return parts_default_nan(s);
894         }
895         break;
896 
897     default:
898         g_assert_not_reached();
899     }
900     return a;
901 }
902 
903 static FloatParts pick_nan(FloatParts a, FloatParts b, float_status *s)
904 {
905     if (is_snan(a.cls) || is_snan(b.cls)) {
906         s->float_exception_flags |= float_flag_invalid;
907     }
908 
909     if (s->default_nan_mode) {
910         return parts_default_nan(s);
911     } else {
912         if (pickNaN(a.cls, b.cls,
913                     a.frac > b.frac ||
914                     (a.frac == b.frac && a.sign < b.sign))) {
915             a = b;
916         }
917         if (is_snan(a.cls)) {
918             return parts_silence_nan(a, s);
919         }
920     }
921     return a;
922 }
923 
924 static FloatParts pick_nan_muladd(FloatParts a, FloatParts b, FloatParts c,
925                                   bool inf_zero, float_status *s)
926 {
927     int which;
928 
929     if (is_snan(a.cls) || is_snan(b.cls) || is_snan(c.cls)) {
930         s->float_exception_flags |= float_flag_invalid;
931     }
932 
933     which = pickNaNMulAdd(a.cls, b.cls, c.cls, inf_zero, s);
934 
935     if (s->default_nan_mode) {
936         /* Note that this check is after pickNaNMulAdd so that function
937          * has an opportunity to set the Invalid flag.
938          */
939         which = 3;
940     }
941 
942     switch (which) {
943     case 0:
944         break;
945     case 1:
946         a = b;
947         break;
948     case 2:
949         a = c;
950         break;
951     case 3:
952         return parts_default_nan(s);
953     default:
954         g_assert_not_reached();
955     }
956 
957     if (is_snan(a.cls)) {
958         return parts_silence_nan(a, s);
959     }
960     return a;
961 }
962 
963 /*
964  * Returns the result of adding or subtracting the values of the
965  * floating-point values `a' and `b'. The operation is performed
966  * according to the IEC/IEEE Standard for Binary Floating-Point
967  * Arithmetic.
968  */
969 
970 static FloatParts addsub_floats(FloatParts a, FloatParts b, bool subtract,
971                                 float_status *s)
972 {
973     bool a_sign = a.sign;
974     bool b_sign = b.sign ^ subtract;
975 
976     if (a_sign != b_sign) {
977         /* Subtraction */
978 
979         if (a.cls == float_class_normal && b.cls == float_class_normal) {
980             if (a.exp > b.exp || (a.exp == b.exp && a.frac >= b.frac)) {
981                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
982                 a.frac = a.frac - b.frac;
983             } else {
984                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
985                 a.frac = b.frac - a.frac;
986                 a.exp = b.exp;
987                 a_sign ^= 1;
988             }
989 
990             if (a.frac == 0) {
991                 a.cls = float_class_zero;
992                 a.sign = s->float_rounding_mode == float_round_down;
993             } else {
994                 int shift = clz64(a.frac) - 1;
995                 a.frac = a.frac << shift;
996                 a.exp = a.exp - shift;
997                 a.sign = a_sign;
998             }
999             return a;
1000         }
1001         if (is_nan(a.cls) || is_nan(b.cls)) {
1002             return pick_nan(a, b, s);
1003         }
1004         if (a.cls == float_class_inf) {
1005             if (b.cls == float_class_inf) {
1006                 float_raise(float_flag_invalid, s);
1007                 return parts_default_nan(s);
1008             }
1009             return a;
1010         }
1011         if (a.cls == float_class_zero && b.cls == float_class_zero) {
1012             a.sign = s->float_rounding_mode == float_round_down;
1013             return a;
1014         }
1015         if (a.cls == float_class_zero || b.cls == float_class_inf) {
1016             b.sign = a_sign ^ 1;
1017             return b;
1018         }
1019         if (b.cls == float_class_zero) {
1020             return a;
1021         }
1022     } else {
1023         /* Addition */
1024         if (a.cls == float_class_normal && b.cls == float_class_normal) {
1025             if (a.exp > b.exp) {
1026                 shift64RightJamming(b.frac, a.exp - b.exp, &b.frac);
1027             } else if (a.exp < b.exp) {
1028                 shift64RightJamming(a.frac, b.exp - a.exp, &a.frac);
1029                 a.exp = b.exp;
1030             }
1031             a.frac += b.frac;
1032             if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
1033                 shift64RightJamming(a.frac, 1, &a.frac);
1034                 a.exp += 1;
1035             }
1036             return a;
1037         }
1038         if (is_nan(a.cls) || is_nan(b.cls)) {
1039             return pick_nan(a, b, s);
1040         }
1041         if (a.cls == float_class_inf || b.cls == float_class_zero) {
1042             return a;
1043         }
1044         if (b.cls == float_class_inf || a.cls == float_class_zero) {
1045             b.sign = b_sign;
1046             return b;
1047         }
1048     }
1049     g_assert_not_reached();
1050 }
1051 
1052 /*
1053  * Returns the result of adding or subtracting the floating-point
1054  * values `a' and `b'. The operation is performed according to the
1055  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1056  */
1057 
1058 float16 QEMU_FLATTEN float16_add(float16 a, float16 b, float_status *status)
1059 {
1060     FloatParts pa = float16_unpack_canonical(a, status);
1061     FloatParts pb = float16_unpack_canonical(b, status);
1062     FloatParts pr = addsub_floats(pa, pb, false, status);
1063 
1064     return float16_round_pack_canonical(pr, status);
1065 }
1066 
1067 float16 QEMU_FLATTEN float16_sub(float16 a, float16 b, float_status *status)
1068 {
1069     FloatParts pa = float16_unpack_canonical(a, status);
1070     FloatParts pb = float16_unpack_canonical(b, status);
1071     FloatParts pr = addsub_floats(pa, pb, true, status);
1072 
1073     return float16_round_pack_canonical(pr, status);
1074 }
1075 
1076 static float32 QEMU_SOFTFLOAT_ATTR
1077 soft_f32_addsub(float32 a, float32 b, bool subtract, float_status *status)
1078 {
1079     FloatParts pa = float32_unpack_canonical(a, status);
1080     FloatParts pb = float32_unpack_canonical(b, status);
1081     FloatParts pr = addsub_floats(pa, pb, subtract, status);
1082 
1083     return float32_round_pack_canonical(pr, status);
1084 }
1085 
1086 static inline float32 soft_f32_add(float32 a, float32 b, float_status *status)
1087 {
1088     return soft_f32_addsub(a, b, false, status);
1089 }
1090 
1091 static inline float32 soft_f32_sub(float32 a, float32 b, float_status *status)
1092 {
1093     return soft_f32_addsub(a, b, true, status);
1094 }
1095 
1096 static float64 QEMU_SOFTFLOAT_ATTR
1097 soft_f64_addsub(float64 a, float64 b, bool subtract, float_status *status)
1098 {
1099     FloatParts pa = float64_unpack_canonical(a, status);
1100     FloatParts pb = float64_unpack_canonical(b, status);
1101     FloatParts pr = addsub_floats(pa, pb, subtract, status);
1102 
1103     return float64_round_pack_canonical(pr, status);
1104 }
1105 
1106 static inline float64 soft_f64_add(float64 a, float64 b, float_status *status)
1107 {
1108     return soft_f64_addsub(a, b, false, status);
1109 }
1110 
1111 static inline float64 soft_f64_sub(float64 a, float64 b, float_status *status)
1112 {
1113     return soft_f64_addsub(a, b, true, status);
1114 }
1115 
/* Host single-precision addition. */
static float hard_f32_add(float a, float b)
{
    const float sum = a + b;

    return sum;
}
1120 
/* Host single-precision subtraction. */
static float hard_f32_sub(float a, float b)
{
    const float difference = a - b;

    return difference;
}
1125 
/* Host double-precision addition. */
static double hard_f64_add(double a, double b)
{
    const double sum = a + b;

    return sum;
}
1130 
/* Host double-precision subtraction. */
static double hard_f64_sub(double a, double b)
{
    const double difference = a - b;

    return difference;
}
1135 
1136 static bool f32_addsub_post(union_float32 a, union_float32 b)
1137 {
1138     if (QEMU_HARDFLOAT_2F32_USE_FP) {
1139         return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
1140     }
1141     return !(float32_is_zero(a.s) && float32_is_zero(b.s));
1142 }
1143 
1144 static bool f64_addsub_post(union_float64 a, union_float64 b)
1145 {
1146     if (QEMU_HARDFLOAT_2F64_USE_FP) {
1147         return !(fpclassify(a.h) == FP_ZERO && fpclassify(b.h) == FP_ZERO);
1148     } else {
1149         return !(float64_is_zero(a.s) && float64_is_zero(b.s));
1150     }
1151 }
1152 
1153 static float32 float32_addsub(float32 a, float32 b, float_status *s,
1154                               hard_f32_op2_fn hard, soft_f32_op2_fn soft)
1155 {
1156     return float32_gen2(a, b, s, hard, soft,
1157                         f32_is_zon2, f32_addsub_post, NULL, NULL);
1158 }
1159 
1160 static float64 float64_addsub(float64 a, float64 b, float_status *s,
1161                               hard_f64_op2_fn hard, soft_f64_op2_fn soft)
1162 {
1163     return float64_gen2(a, b, s, hard, soft,
1164                         f64_is_zon2, f64_addsub_post, NULL, NULL);
1165 }
1166 
1167 float32 QEMU_FLATTEN
1168 float32_add(float32 a, float32 b, float_status *s)
1169 {
1170     return float32_addsub(a, b, s, hard_f32_add, soft_f32_add);
1171 }
1172 
1173 float32 QEMU_FLATTEN
1174 float32_sub(float32 a, float32 b, float_status *s)
1175 {
1176     return float32_addsub(a, b, s, hard_f32_sub, soft_f32_sub);
1177 }
1178 
1179 float64 QEMU_FLATTEN
1180 float64_add(float64 a, float64 b, float_status *s)
1181 {
1182     return float64_addsub(a, b, s, hard_f64_add, soft_f64_add);
1183 }
1184 
1185 float64 QEMU_FLATTEN
1186 float64_sub(float64 a, float64 b, float_status *s)
1187 {
1188     return float64_addsub(a, b, s, hard_f64_sub, soft_f64_sub);
1189 }
1190 
1191 /*
1192  * Returns the result of multiplying the floating-point values `a' and
1193  * `b'. The operation is performed according to the IEC/IEEE Standard
1194  * for Binary Floating-Point Arithmetic.
1195  */
1196 
1197 static FloatParts mul_floats(FloatParts a, FloatParts b, float_status *s)
1198 {
1199     bool sign = a.sign ^ b.sign;
1200 
1201     if (a.cls == float_class_normal && b.cls == float_class_normal) {
1202         uint64_t hi, lo;
1203         int exp = a.exp + b.exp;
1204 
1205         mul64To128(a.frac, b.frac, &hi, &lo);
1206         shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
1207         if (lo & DECOMPOSED_OVERFLOW_BIT) {
1208             shift64RightJamming(lo, 1, &lo);
1209             exp += 1;
1210         }
1211 
1212         /* Re-use a */
1213         a.exp = exp;
1214         a.sign = sign;
1215         a.frac = lo;
1216         return a;
1217     }
1218     /* handle all the NaN cases */
1219     if (is_nan(a.cls) || is_nan(b.cls)) {
1220         return pick_nan(a, b, s);
1221     }
1222     /* Inf * Zero == NaN */
1223     if ((a.cls == float_class_inf && b.cls == float_class_zero) ||
1224         (a.cls == float_class_zero && b.cls == float_class_inf)) {
1225         s->float_exception_flags |= float_flag_invalid;
1226         return parts_default_nan(s);
1227     }
1228     /* Multiply by 0 or Inf */
1229     if (a.cls == float_class_inf || a.cls == float_class_zero) {
1230         a.sign = sign;
1231         return a;
1232     }
1233     if (b.cls == float_class_inf || b.cls == float_class_zero) {
1234         b.sign = sign;
1235         return b;
1236     }
1237     g_assert_not_reached();
1238 }
1239 
1240 float16 QEMU_FLATTEN float16_mul(float16 a, float16 b, float_status *status)
1241 {
1242     FloatParts pa = float16_unpack_canonical(a, status);
1243     FloatParts pb = float16_unpack_canonical(b, status);
1244     FloatParts pr = mul_floats(pa, pb, status);
1245 
1246     return float16_round_pack_canonical(pr, status);
1247 }
1248 
1249 static float32 QEMU_SOFTFLOAT_ATTR
1250 soft_f32_mul(float32 a, float32 b, float_status *status)
1251 {
1252     FloatParts pa = float32_unpack_canonical(a, status);
1253     FloatParts pb = float32_unpack_canonical(b, status);
1254     FloatParts pr = mul_floats(pa, pb, status);
1255 
1256     return float32_round_pack_canonical(pr, status);
1257 }
1258 
1259 static float64 QEMU_SOFTFLOAT_ATTR
1260 soft_f64_mul(float64 a, float64 b, float_status *status)
1261 {
1262     FloatParts pa = float64_unpack_canonical(a, status);
1263     FloatParts pb = float64_unpack_canonical(b, status);
1264     FloatParts pr = mul_floats(pa, pb, status);
1265 
1266     return float64_round_pack_canonical(pr, status);
1267 }
1268 
/* Host single-precision multiplication. */
static float hard_f32_mul(float a, float b)
{
    const float product = a * b;

    return product;
}
1273 
/* Host double-precision multiplication. */
static double hard_f64_mul(double a, double b)
{
    const double product = a * b;

    return product;
}
1278 
1279 static bool f32_mul_fast_test(union_float32 a, union_float32 b)
1280 {
1281     return float32_is_zero(a.s) || float32_is_zero(b.s);
1282 }
1283 
1284 static bool f64_mul_fast_test(union_float64 a, union_float64 b)
1285 {
1286     return float64_is_zero(a.s) || float64_is_zero(b.s);
1287 }
1288 
1289 static float32 f32_mul_fast_op(float32 a, float32 b, float_status *s)
1290 {
1291     bool signbit = float32_is_neg(a) ^ float32_is_neg(b);
1292 
1293     return float32_set_sign(float32_zero, signbit);
1294 }
1295 
1296 static float64 f64_mul_fast_op(float64 a, float64 b, float_status *s)
1297 {
1298     bool signbit = float64_is_neg(a) ^ float64_is_neg(b);
1299 
1300     return float64_set_sign(float64_zero, signbit);
1301 }
1302 
1303 float32 QEMU_FLATTEN
1304 float32_mul(float32 a, float32 b, float_status *s)
1305 {
1306     return float32_gen2(a, b, s, hard_f32_mul, soft_f32_mul,
1307                         f32_is_zon2, NULL, f32_mul_fast_test, f32_mul_fast_op);
1308 }
1309 
1310 float64 QEMU_FLATTEN
1311 float64_mul(float64 a, float64 b, float_status *s)
1312 {
1313     return float64_gen2(a, b, s, hard_f64_mul, soft_f64_mul,
1314                         f64_is_zon2, NULL, f64_mul_fast_test, f64_mul_fast_op);
1315 }
1316 
/*
 * Returns the result of multiplying the floating-point values `a' and
 * `b' then adding 'c', with no intermediate rounding step after the
 * multiplication. The operation is performed according to the
 * IEC/IEEE Standard for Binary Floating-Point Arithmetic 754-2008.
 * The flags argument allows the caller to select negation of the
 * addend, the intermediate product, or the final result. (The
 * difference between this and having the caller do a separate
 * negation is that negating externally will flip the sign bit on
 * NaNs.)
 *
 * Flag bits examined here:
 *   float_muladd_negate_c        - flip the sign of the addend `c'
 *   float_muladd_negate_product  - flip the sign of the product a * b
 *   float_muladd_negate_result   - flip the sign of the non-NaN result
 *   float_muladd_halve_result    - decrement the result exponent by one
 *
 * The result is returned in decomposed form; rounding and packing are
 * left to the caller.
 */

static FloatParts muladd_floats(FloatParts a, FloatParts b, FloatParts c,
                                int flags, float_status *s)
{
    /* True exactly when one of a, b is Inf and the other is Zero. */
    bool inf_zero = ((1 << a.cls) | (1 << b.cls)) ==
                    ((1 << float_class_inf) | (1 << float_class_zero));
    bool p_sign;
    bool sign_flip = flags & float_muladd_negate_result;
    FloatClass p_class;
    uint64_t hi, lo;
    int p_exp;

    /* It is implementation-defined whether the cases of (0,inf,qnan)
     * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
     * they return if they do), so we have to hand this information
     * off to the target-specific pick-a-NaN routine.
     */
    if (is_nan(a.cls) || is_nan(b.cls) || is_nan(c.cls)) {
        return pick_nan_muladd(a, b, c, inf_zero, s);
    }

    /* No NaN inputs: Inf * 0 is an invalid operation. */
    if (inf_zero) {
        s->float_exception_flags |= float_flag_invalid;
        return parts_default_nan(s);
    }

    if (flags & float_muladd_negate_c) {
        c.sign ^= 1;
    }

    p_sign = a.sign ^ b.sign;

    if (flags & float_muladd_negate_product) {
        p_sign ^= 1;
    }

    /* Classify the product a * b without computing it yet. */
    if (a.cls == float_class_inf || b.cls == float_class_inf) {
        p_class = float_class_inf;
    } else if (a.cls == float_class_zero || b.cls == float_class_zero) {
        p_class = float_class_zero;
    } else {
        p_class = float_class_normal;
    }

    if (c.cls == float_class_inf) {
        if (p_class == float_class_inf && p_sign != c.sign) {
            /* Inf + (-Inf) is an invalid operation. */
            s->float_exception_flags |= float_flag_invalid;
            return parts_default_nan(s);
        } else {
            /* Infinite addend: the result is Inf with c's sign. */
            a.cls = float_class_inf;
            a.sign = c.sign ^ sign_flip;
            return a;
        }
    }

    /* Infinite product with a finite addend: Inf with the product's sign. */
    if (p_class == float_class_inf) {
        a.cls = float_class_inf;
        a.sign = p_sign ^ sign_flip;
        return a;
    }

    /* Zero product: the result is the addend `c', adjusted as below. */
    if (p_class == float_class_zero) {
        if (c.cls == float_class_zero) {
            /* 0 + 0: opposite signs give -0 only when rounding down. */
            if (p_sign != c.sign) {
                p_sign = s->float_rounding_mode == float_round_down;
            }
            c.sign = p_sign;
        } else if (flags & float_muladd_halve_result) {
            c.exp -= 1;
        }
        c.sign ^= sign_flip;
        return c;
    }

    /* a & b should be normals now... */
    assert(a.cls == float_class_normal &&
           b.cls == float_class_normal);

    p_exp = a.exp + b.exp;

    /* Multiply of 2 62-bit numbers produces a (2*62) == 124-bit
     * result.
     */
    mul64To128(a.frac, b.frac, &hi, &lo);
    /* binary point now at bit 124 */

    /* check for overflow */
    if (hi & (1ULL << (DECOMPOSED_BINARY_POINT * 2 + 1 - 64))) {
        shift128RightJamming(hi, lo, 1, &hi, &lo);
        p_exp += 1;
    }

    /* + add/sub */
    if (c.cls == float_class_zero) {
        /* move binary point back to 62 */
        shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
    } else {
        /* Positive when the product has the larger exponent. */
        int exp_diff = p_exp - c.exp;
        if (p_sign == c.sign) {
            /* Addition */
            if (exp_diff <= 0) {
                /* Bring the product down to c's binary point and
                 * exponent in a single jamming shift, then add in the
                 * low word.
                 */
                shift128RightJamming(hi, lo,
                                     DECOMPOSED_BINARY_POINT - exp_diff,
                                     &hi, &lo);
                lo += c.frac;
                p_exp = c.exp;
            } else {
                uint64_t c_hi, c_lo;
                /* shift c to the same binary point as the product (124) */
                c_hi = c.frac >> 2;
                c_lo = 0;
                shift128RightJamming(c_hi, c_lo,
                                     exp_diff,
                                     &c_hi, &c_lo);
                add128(hi, lo, c_hi, c_lo, &hi, &lo);
                /* move binary point back to 62 */
                shift128RightJamming(hi, lo, DECOMPOSED_BINARY_POINT, &hi, &lo);
            }

            /* A carry out of the sum needs one renormalising shift. */
            if (lo & DECOMPOSED_OVERFLOW_BIT) {
                shift64RightJamming(lo, 1, &lo);
                p_exp += 1;
            }

        } else {
            /* Subtraction */
            uint64_t c_hi, c_lo;
            /* make C binary point match product at bit 124 */
            c_hi = c.frac >> 2;
            c_lo = 0;

            if (exp_diff <= 0) {
                /* Align the product to c's exponent. */
                shift128RightJamming(hi, lo, -exp_diff, &hi, &lo);
                if (exp_diff == 0
                    &&
                    (hi > c_hi || (hi == c_hi && lo >= c_lo))) {
                    /* Equal exponents and product >= c: product - c. */
                    sub128(hi, lo, c_hi, c_lo, &hi, &lo);
                } else {
                    /* Otherwise compute c - product; the sign flips and
                     * the result takes c's exponent.
                     */
                    sub128(c_hi, c_lo, hi, lo, &hi, &lo);
                    p_sign ^= 1;
                    p_exp = c.exp;
                }
            } else {
                /* Product has the larger exponent: align c and subtract. */
                shift128RightJamming(c_hi, c_lo,
                                     exp_diff,
                                     &c_hi, &c_lo);
                sub128(hi, lo, c_hi, c_lo, &hi, &lo);
            }

            if (hi == 0 && lo == 0) {
                /* Exact cancellation: zero, negative only when rounding
                 * down (before applying the result negation).
                 */
                a.cls = float_class_zero;
                a.sign = s->float_rounding_mode == float_round_down;
                a.sign ^= sign_flip;
                return a;
            } else {
                int shift;
                /* Count leading zeros across the 128-bit difference. */
                if (hi != 0) {
                    shift = clz64(hi);
                } else {
                    shift = clz64(lo) + 64;
                }
                /* Normalizing to a binary point of 124 is the
                   correct adjust for the exponent.  However since we're
                   shifting, we might as well put the binary point back
                   at 62 where we really want it.  Therefore shift as
                   if we're leaving 1 bit at the top of the word, but
                   adjust the exponent as if we're leaving 3 bits.  */
                shift -= 1;
                if (shift >= 64) {
                    lo = lo << (shift - 64);
                } else {
                    hi = (hi << shift) | (lo >> (64 - shift));
                    /* Fold any bits shifted out of view into a sticky lsb. */
                    lo = hi | ((lo << shift) != 0);
                }
                p_exp -= shift - 2;
            }
        }
    }

    if (flags & float_muladd_halve_result) {
        p_exp -= 1;
    }

    /* finally prepare our result */
    a.cls = float_class_normal;
    a.sign = p_sign ^ sign_flip;
    a.exp = p_exp;
    a.frac = lo;

    return a;
}
1519 
1520 float16 QEMU_FLATTEN float16_muladd(float16 a, float16 b, float16 c,
1521                                                 int flags, float_status *status)
1522 {
1523     FloatParts pa = float16_unpack_canonical(a, status);
1524     FloatParts pb = float16_unpack_canonical(b, status);
1525     FloatParts pc = float16_unpack_canonical(c, status);
1526     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1527 
1528     return float16_round_pack_canonical(pr, status);
1529 }
1530 
1531 static float32 QEMU_SOFTFLOAT_ATTR
1532 soft_f32_muladd(float32 a, float32 b, float32 c, int flags,
1533                 float_status *status)
1534 {
1535     FloatParts pa = float32_unpack_canonical(a, status);
1536     FloatParts pb = float32_unpack_canonical(b, status);
1537     FloatParts pc = float32_unpack_canonical(c, status);
1538     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1539 
1540     return float32_round_pack_canonical(pr, status);
1541 }
1542 
1543 static float64 QEMU_SOFTFLOAT_ATTR
1544 soft_f64_muladd(float64 a, float64 b, float64 c, int flags,
1545                 float_status *status)
1546 {
1547     FloatParts pa = float64_unpack_canonical(a, status);
1548     FloatParts pb = float64_unpack_canonical(b, status);
1549     FloatParts pc = float64_unpack_canonical(c, status);
1550     FloatParts pr = muladd_floats(pa, pb, pc, flags, status);
1551 
1552     return float64_round_pack_canonical(pr, status);
1553 }
1554 
/*
 * When set, float32_muladd() and float64_muladd() skip the host fma
 * path and always use the softfloat implementation.
 */
static bool force_soft_fma;
1556 
1557 float32 QEMU_FLATTEN
1558 float32_muladd(float32 xa, float32 xb, float32 xc, int flags, float_status *s)
1559 {
1560     union_float32 ua, ub, uc, ur;
1561 
1562     ua.s = xa;
1563     ub.s = xb;
1564     uc.s = xc;
1565 
1566     if (unlikely(!can_use_fpu(s))) {
1567         goto soft;
1568     }
1569     if (unlikely(flags & float_muladd_halve_result)) {
1570         goto soft;
1571     }
1572 
1573     float32_input_flush3(&ua.s, &ub.s, &uc.s, s);
1574     if (unlikely(!f32_is_zon3(ua, ub, uc))) {
1575         goto soft;
1576     }
1577 
1578     if (unlikely(force_soft_fma)) {
1579         goto soft;
1580     }
1581 
1582     /*
1583      * When (a || b) == 0, there's no need to check for under/over flow,
1584      * since we know the addend is (normal || 0) and the product is 0.
1585      */
1586     if (float32_is_zero(ua.s) || float32_is_zero(ub.s)) {
1587         union_float32 up;
1588         bool prod_sign;
1589 
1590         prod_sign = float32_is_neg(ua.s) ^ float32_is_neg(ub.s);
1591         prod_sign ^= !!(flags & float_muladd_negate_product);
1592         up.s = float32_set_sign(float32_zero, prod_sign);
1593 
1594         if (flags & float_muladd_negate_c) {
1595             uc.h = -uc.h;
1596         }
1597         ur.h = up.h + uc.h;
1598     } else {
1599         if (flags & float_muladd_negate_product) {
1600             ua.h = -ua.h;
1601         }
1602         if (flags & float_muladd_negate_c) {
1603             uc.h = -uc.h;
1604         }
1605 
1606         ur.h = fmaf(ua.h, ub.h, uc.h);
1607 
1608         if (unlikely(f32_is_inf(ur))) {
1609             s->float_exception_flags |= float_flag_overflow;
1610         } else if (unlikely(fabsf(ur.h) <= FLT_MIN)) {
1611             goto soft;
1612         }
1613     }
1614     if (flags & float_muladd_negate_result) {
1615         return float32_chs(ur.s);
1616     }
1617     return ur.s;
1618 
1619  soft:
1620     return soft_f32_muladd(ua.s, ub.s, uc.s, flags, s);
1621 }
1622 
1623 float64 QEMU_FLATTEN
1624 float64_muladd(float64 xa, float64 xb, float64 xc, int flags, float_status *s)
1625 {
1626     union_float64 ua, ub, uc, ur;
1627 
1628     ua.s = xa;
1629     ub.s = xb;
1630     uc.s = xc;
1631 
1632     if (unlikely(!can_use_fpu(s))) {
1633         goto soft;
1634     }
1635     if (unlikely(flags & float_muladd_halve_result)) {
1636         goto soft;
1637     }
1638 
1639     float64_input_flush3(&ua.s, &ub.s, &uc.s, s);
1640     if (unlikely(!f64_is_zon3(ua, ub, uc))) {
1641         goto soft;
1642     }
1643 
1644     if (unlikely(force_soft_fma)) {
1645         goto soft;
1646     }
1647 
1648     /*
1649      * When (a || b) == 0, there's no need to check for under/over flow,
1650      * since we know the addend is (normal || 0) and the product is 0.
1651      */
1652     if (float64_is_zero(ua.s) || float64_is_zero(ub.s)) {
1653         union_float64 up;
1654         bool prod_sign;
1655 
1656         prod_sign = float64_is_neg(ua.s) ^ float64_is_neg(ub.s);
1657         prod_sign ^= !!(flags & float_muladd_negate_product);
1658         up.s = float64_set_sign(float64_zero, prod_sign);
1659 
1660         if (flags & float_muladd_negate_c) {
1661             uc.h = -uc.h;
1662         }
1663         ur.h = up.h + uc.h;
1664     } else {
1665         if (flags & float_muladd_negate_product) {
1666             ua.h = -ua.h;
1667         }
1668         if (flags & float_muladd_negate_c) {
1669             uc.h = -uc.h;
1670         }
1671 
1672         ur.h = fma(ua.h, ub.h, uc.h);
1673 
1674         if (unlikely(f64_is_inf(ur))) {
1675             s->float_exception_flags |= float_flag_overflow;
1676         } else if (unlikely(fabs(ur.h) <= FLT_MIN)) {
1677             goto soft;
1678         }
1679     }
1680     if (flags & float_muladd_negate_result) {
1681         return float64_chs(ur.s);
1682     }
1683     return ur.s;
1684 
1685  soft:
1686     return soft_f64_muladd(ua.s, ub.s, uc.s, flags, s);
1687 }
1688 
1689 /*
1690  * Returns the result of dividing the floating-point value `a' by the
1691  * corresponding value `b'. The operation is performed according to
1692  * the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1693  */
1694 
1695 static FloatParts div_floats(FloatParts a, FloatParts b, float_status *s)
1696 {
1697     bool sign = a.sign ^ b.sign;
1698 
1699     if (a.cls == float_class_normal && b.cls == float_class_normal) {
1700         uint64_t n0, n1, q, r;
1701         int exp = a.exp - b.exp;
1702 
1703         /*
1704          * We want a 2*N / N-bit division to produce exactly an N-bit
1705          * result, so that we do not lose any precision and so that we
1706          * do not have to renormalize afterward.  If A.frac < B.frac,
1707          * then division would produce an (N-1)-bit result; shift A left
1708          * by one to produce the an N-bit result, and decrement the
1709          * exponent to match.
1710          *
1711          * The udiv_qrnnd algorithm that we're using requires normalization,
1712          * i.e. the msb of the denominator must be set.  Since we know that
1713          * DECOMPOSED_BINARY_POINT is msb-1, the inputs must be shifted left
1714          * by one (more), and the remainder must be shifted right by one.
1715          */
1716         if (a.frac < b.frac) {
1717             exp -= 1;
1718             shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 2, &n1, &n0);
1719         } else {
1720             shift128Left(0, a.frac, DECOMPOSED_BINARY_POINT + 1, &n1, &n0);
1721         }
1722         q = udiv_qrnnd(&r, n1, n0, b.frac << 1);
1723 
1724         /*
1725          * Set lsb if there is a remainder, to set inexact.
1726          * As mentioned above, to find the actual value of the remainder we
1727          * would need to shift right, but (1) we are only concerned about
1728          * non-zero-ness, and (2) the remainder will always be even because
1729          * both inputs to the division primitive are even.
1730          */
1731         a.frac = q | (r != 0);
1732         a.sign = sign;
1733         a.exp = exp;
1734         return a;
1735     }
1736     /* handle all the NaN cases */
1737     if (is_nan(a.cls) || is_nan(b.cls)) {
1738         return pick_nan(a, b, s);
1739     }
1740     /* 0/0 or Inf/Inf */
1741     if (a.cls == b.cls
1742         &&
1743         (a.cls == float_class_inf || a.cls == float_class_zero)) {
1744         s->float_exception_flags |= float_flag_invalid;
1745         return parts_default_nan(s);
1746     }
1747     /* Inf / x or 0 / x */
1748     if (a.cls == float_class_inf || a.cls == float_class_zero) {
1749         a.sign = sign;
1750         return a;
1751     }
1752     /* Div 0 => Inf */
1753     if (b.cls == float_class_zero) {
1754         s->float_exception_flags |= float_flag_divbyzero;
1755         a.cls = float_class_inf;
1756         a.sign = sign;
1757         return a;
1758     }
1759     /* Div by Inf */
1760     if (b.cls == float_class_inf) {
1761         a.cls = float_class_zero;
1762         a.sign = sign;
1763         return a;
1764     }
1765     g_assert_not_reached();
1766 }
1767 
1768 float16 float16_div(float16 a, float16 b, float_status *status)
1769 {
1770     FloatParts pa = float16_unpack_canonical(a, status);
1771     FloatParts pb = float16_unpack_canonical(b, status);
1772     FloatParts pr = div_floats(pa, pb, status);
1773 
1774     return float16_round_pack_canonical(pr, status);
1775 }
1776 
1777 static float32 QEMU_SOFTFLOAT_ATTR
1778 soft_f32_div(float32 a, float32 b, float_status *status)
1779 {
1780     FloatParts pa = float32_unpack_canonical(a, status);
1781     FloatParts pb = float32_unpack_canonical(b, status);
1782     FloatParts pr = div_floats(pa, pb, status);
1783 
1784     return float32_round_pack_canonical(pr, status);
1785 }
1786 
1787 static float64 QEMU_SOFTFLOAT_ATTR
1788 soft_f64_div(float64 a, float64 b, float_status *status)
1789 {
1790     FloatParts pa = float64_unpack_canonical(a, status);
1791     FloatParts pb = float64_unpack_canonical(b, status);
1792     FloatParts pr = div_floats(pa, pb, status);
1793 
1794     return float64_round_pack_canonical(pr, status);
1795 }
1796 
/* Single-precision division using the host's native `float' arithmetic. */
static float hard_f32_div(float a, float b)
{
    const float quotient = a / b;

    return quotient;
}
1801 
/* Double-precision division using the host's native `double' arithmetic. */
static double hard_f64_div(double a, double b)
{
    const double quotient = a / b;

    return quotient;
}
1806 
1807 static bool f32_div_pre(union_float32 a, union_float32 b)
1808 {
1809     if (QEMU_HARDFLOAT_2F32_USE_FP) {
1810         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
1811                fpclassify(b.h) == FP_NORMAL;
1812     }
1813     return float32_is_zero_or_normal(a.s) && float32_is_normal(b.s);
1814 }
1815 
1816 static bool f64_div_pre(union_float64 a, union_float64 b)
1817 {
1818     if (QEMU_HARDFLOAT_2F64_USE_FP) {
1819         return (fpclassify(a.h) == FP_NORMAL || fpclassify(a.h) == FP_ZERO) &&
1820                fpclassify(b.h) == FP_NORMAL;
1821     }
1822     return float64_is_zero_or_normal(a.s) && float64_is_normal(b.s);
1823 }
1824 
1825 static bool f32_div_post(union_float32 a, union_float32 b)
1826 {
1827     if (QEMU_HARDFLOAT_2F32_USE_FP) {
1828         return fpclassify(a.h) != FP_ZERO;
1829     }
1830     return !float32_is_zero(a.s);
1831 }
1832 
1833 static bool f64_div_post(union_float64 a, union_float64 b)
1834 {
1835     if (QEMU_HARDFLOAT_2F64_USE_FP) {
1836         return fpclassify(a.h) != FP_ZERO;
1837     }
1838     return !float64_is_zero(a.s);
1839 }
1840 
1841 float32 QEMU_FLATTEN
1842 float32_div(float32 a, float32 b, float_status *s)
1843 {
1844     return float32_gen2(a, b, s, hard_f32_div, soft_f32_div,
1845                         f32_div_pre, f32_div_post, NULL, NULL);
1846 }
1847 
1848 float64 QEMU_FLATTEN
1849 float64_div(float64 a, float64 b, float_status *s)
1850 {
1851     return float64_gen2(a, b, s, hard_f64_div, soft_f64_div,
1852                         f64_div_pre, f64_div_post, NULL, NULL);
1853 }
1854 
1855 /*
1856  * Float to Float conversions
1857  *
1858  * Returns the result of converting one float format to another. The
1859  * conversion is performed according to the IEC/IEEE Standard for
1860  * Binary Floating-Point Arithmetic.
1861  *
1862  * The float_to_float helper only needs to take care of raising
1863  * invalid exceptions and handling the conversion on NaNs.
1864  */
1865 
1866 static FloatParts float_to_float(FloatParts a, const FloatFmt *dstf,
1867                                  float_status *s)
1868 {
1869     if (dstf->arm_althp) {
1870         switch (a.cls) {
1871         case float_class_qnan:
1872         case float_class_snan:
1873             /* There is no NaN in the destination format.  Raise Invalid
1874              * and return a zero with the sign of the input NaN.
1875              */
1876             s->float_exception_flags |= float_flag_invalid;
1877             a.cls = float_class_zero;
1878             a.frac = 0;
1879             a.exp = 0;
1880             break;
1881 
1882         case float_class_inf:
1883             /* There is no Inf in the destination format.  Raise Invalid
1884              * and return the maximum normal with the correct sign.
1885              */
1886             s->float_exception_flags |= float_flag_invalid;
1887             a.cls = float_class_normal;
1888             a.exp = dstf->exp_max;
1889             a.frac = ((1ull << dstf->frac_size) - 1) << dstf->frac_shift;
1890             break;
1891 
1892         default:
1893             break;
1894         }
1895     } else if (is_nan(a.cls)) {
1896         if (is_snan(a.cls)) {
1897             s->float_exception_flags |= float_flag_invalid;
1898             a = parts_silence_nan(a, s);
1899         }
1900         if (s->default_nan_mode) {
1901             return parts_default_nan(s);
1902         }
1903     }
1904     return a;
1905 }
1906 
1907 float32 float16_to_float32(float16 a, bool ieee, float_status *s)
1908 {
1909     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1910     FloatParts p = float16a_unpack_canonical(a, s, fmt16);
1911     FloatParts pr = float_to_float(p, &float32_params, s);
1912     return float32_round_pack_canonical(pr, s);
1913 }
1914 
1915 float64 float16_to_float64(float16 a, bool ieee, float_status *s)
1916 {
1917     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1918     FloatParts p = float16a_unpack_canonical(a, s, fmt16);
1919     FloatParts pr = float_to_float(p, &float64_params, s);
1920     return float64_round_pack_canonical(pr, s);
1921 }
1922 
1923 float16 float32_to_float16(float32 a, bool ieee, float_status *s)
1924 {
1925     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1926     FloatParts p = float32_unpack_canonical(a, s);
1927     FloatParts pr = float_to_float(p, fmt16, s);
1928     return float16a_round_pack_canonical(pr, s, fmt16);
1929 }
1930 
1931 float64 float32_to_float64(float32 a, float_status *s)
1932 {
1933     FloatParts p = float32_unpack_canonical(a, s);
1934     FloatParts pr = float_to_float(p, &float64_params, s);
1935     return float64_round_pack_canonical(pr, s);
1936 }
1937 
1938 float16 float64_to_float16(float64 a, bool ieee, float_status *s)
1939 {
1940     const FloatFmt *fmt16 = ieee ? &float16_params : &float16_params_ahp;
1941     FloatParts p = float64_unpack_canonical(a, s);
1942     FloatParts pr = float_to_float(p, fmt16, s);
1943     return float16a_round_pack_canonical(pr, s, fmt16);
1944 }
1945 
1946 float32 float64_to_float32(float64 a, float_status *s)
1947 {
1948     FloatParts p = float64_unpack_canonical(a, s);
1949     FloatParts pr = float_to_float(p, &float32_params, s);
1950     return float32_round_pack_canonical(pr, s);
1951 }
1952 
1953 /*
1954  * Rounds the floating-point value `a' to an integer, and returns the
1955  * result as a floating-point value. The operation is performed
1956  * according to the IEC/IEEE Standard for Binary Floating-Point
1957  * Arithmetic.
1958  */
1959 
1960 static FloatParts round_to_int(FloatParts a, int rmode,
1961                                int scale, float_status *s)
1962 {
1963     switch (a.cls) {
1964     case float_class_qnan:
1965     case float_class_snan:
1966         return return_nan(a, s);
1967 
1968     case float_class_zero:
1969     case float_class_inf:
1970         /* already "integral" */
1971         break;
1972 
1973     case float_class_normal:
1974         scale = MIN(MAX(scale, -0x10000), 0x10000);
1975         a.exp += scale;
1976 
1977         if (a.exp >= DECOMPOSED_BINARY_POINT) {
1978             /* already integral */
1979             break;
1980         }
1981         if (a.exp < 0) {
1982             bool one;
1983             /* all fractional */
1984             s->float_exception_flags |= float_flag_inexact;
1985             switch (rmode) {
1986             case float_round_nearest_even:
1987                 one = a.exp == -1 && a.frac > DECOMPOSED_IMPLICIT_BIT;
1988                 break;
1989             case float_round_ties_away:
1990                 one = a.exp == -1 && a.frac >= DECOMPOSED_IMPLICIT_BIT;
1991                 break;
1992             case float_round_to_zero:
1993                 one = false;
1994                 break;
1995             case float_round_up:
1996                 one = !a.sign;
1997                 break;
1998             case float_round_down:
1999                 one = a.sign;
2000                 break;
2001             case float_round_to_odd:
2002                 one = true;
2003                 break;
2004             default:
2005                 g_assert_not_reached();
2006             }
2007 
2008             if (one) {
2009                 a.frac = DECOMPOSED_IMPLICIT_BIT;
2010                 a.exp = 0;
2011             } else {
2012                 a.cls = float_class_zero;
2013             }
2014         } else {
2015             uint64_t frac_lsb = DECOMPOSED_IMPLICIT_BIT >> a.exp;
2016             uint64_t frac_lsbm1 = frac_lsb >> 1;
2017             uint64_t rnd_even_mask = (frac_lsb - 1) | frac_lsb;
2018             uint64_t rnd_mask = rnd_even_mask >> 1;
2019             uint64_t inc;
2020 
2021             switch (rmode) {
2022             case float_round_nearest_even:
2023                 inc = ((a.frac & rnd_even_mask) != frac_lsbm1 ? frac_lsbm1 : 0);
2024                 break;
2025             case float_round_ties_away:
2026                 inc = frac_lsbm1;
2027                 break;
2028             case float_round_to_zero:
2029                 inc = 0;
2030                 break;
2031             case float_round_up:
2032                 inc = a.sign ? 0 : rnd_mask;
2033                 break;
2034             case float_round_down:
2035                 inc = a.sign ? rnd_mask : 0;
2036                 break;
2037             case float_round_to_odd:
2038                 inc = a.frac & frac_lsb ? 0 : rnd_mask;
2039                 break;
2040             default:
2041                 g_assert_not_reached();
2042             }
2043 
2044             if (a.frac & rnd_mask) {
2045                 s->float_exception_flags |= float_flag_inexact;
2046                 a.frac += inc;
2047                 a.frac &= ~rnd_mask;
2048                 if (a.frac & DECOMPOSED_OVERFLOW_BIT) {
2049                     a.frac >>= 1;
2050                     a.exp++;
2051                 }
2052             }
2053         }
2054         break;
2055     default:
2056         g_assert_not_reached();
2057     }
2058     return a;
2059 }
2060 
2061 float16 float16_round_to_int(float16 a, float_status *s)
2062 {
2063     FloatParts pa = float16_unpack_canonical(a, s);
2064     FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2065     return float16_round_pack_canonical(pr, s);
2066 }
2067 
2068 float32 float32_round_to_int(float32 a, float_status *s)
2069 {
2070     FloatParts pa = float32_unpack_canonical(a, s);
2071     FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2072     return float32_round_pack_canonical(pr, s);
2073 }
2074 
2075 float64 float64_round_to_int(float64 a, float_status *s)
2076 {
2077     FloatParts pa = float64_unpack_canonical(a, s);
2078     FloatParts pr = round_to_int(pa, s->float_rounding_mode, 0, s);
2079     return float64_round_pack_canonical(pr, s);
2080 }
2081 
2082 /*
2083  * Returns the result of converting the floating-point value `a' to
2084  * the two's complement integer format. The conversion is performed
2085  * according to the IEC/IEEE Standard for Binary Floating-Point
2086  * Arithmetic---which means in particular that the conversion is
2087  * rounded according to the current rounding mode. If `a' is a NaN,
2088  * the largest positive integer is returned. Otherwise, if the
2089  * conversion overflows, the largest integer with the same sign as `a'
2090  * is returned.
2091 */
2092 
2093 static int64_t round_to_int_and_pack(FloatParts in, int rmode, int scale,
2094                                      int64_t min, int64_t max,
2095                                      float_status *s)
2096 {
2097     uint64_t r;
2098     int orig_flags = get_float_exception_flags(s);
2099     FloatParts p = round_to_int(in, rmode, scale, s);
2100 
2101     switch (p.cls) {
2102     case float_class_snan:
2103     case float_class_qnan:
2104         s->float_exception_flags = orig_flags | float_flag_invalid;
2105         return max;
2106     case float_class_inf:
2107         s->float_exception_flags = orig_flags | float_flag_invalid;
2108         return p.sign ? min : max;
2109     case float_class_zero:
2110         return 0;
2111     case float_class_normal:
2112         if (p.exp < DECOMPOSED_BINARY_POINT) {
2113             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
2114         } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
2115             r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
2116         } else {
2117             r = UINT64_MAX;
2118         }
2119         if (p.sign) {
2120             if (r <= -(uint64_t) min) {
2121                 return -r;
2122             } else {
2123                 s->float_exception_flags = orig_flags | float_flag_invalid;
2124                 return min;
2125             }
2126         } else {
2127             if (r <= max) {
2128                 return r;
2129             } else {
2130                 s->float_exception_flags = orig_flags | float_flag_invalid;
2131                 return max;
2132             }
2133         }
2134     default:
2135         g_assert_not_reached();
2136     }
2137 }
2138 
2139 int16_t float16_to_int16_scalbn(float16 a, int rmode, int scale,
2140                                 float_status *s)
2141 {
2142     return round_to_int_and_pack(float16_unpack_canonical(a, s),
2143                                  rmode, scale, INT16_MIN, INT16_MAX, s);
2144 }
2145 
2146 int32_t float16_to_int32_scalbn(float16 a, int rmode, int scale,
2147                                 float_status *s)
2148 {
2149     return round_to_int_and_pack(float16_unpack_canonical(a, s),
2150                                  rmode, scale, INT32_MIN, INT32_MAX, s);
2151 }
2152 
2153 int64_t float16_to_int64_scalbn(float16 a, int rmode, int scale,
2154                                 float_status *s)
2155 {
2156     return round_to_int_and_pack(float16_unpack_canonical(a, s),
2157                                  rmode, scale, INT64_MIN, INT64_MAX, s);
2158 }
2159 
2160 int16_t float32_to_int16_scalbn(float32 a, int rmode, int scale,
2161                                 float_status *s)
2162 {
2163     return round_to_int_and_pack(float32_unpack_canonical(a, s),
2164                                  rmode, scale, INT16_MIN, INT16_MAX, s);
2165 }
2166 
2167 int32_t float32_to_int32_scalbn(float32 a, int rmode, int scale,
2168                                 float_status *s)
2169 {
2170     return round_to_int_and_pack(float32_unpack_canonical(a, s),
2171                                  rmode, scale, INT32_MIN, INT32_MAX, s);
2172 }
2173 
2174 int64_t float32_to_int64_scalbn(float32 a, int rmode, int scale,
2175                                 float_status *s)
2176 {
2177     return round_to_int_and_pack(float32_unpack_canonical(a, s),
2178                                  rmode, scale, INT64_MIN, INT64_MAX, s);
2179 }
2180 
2181 int16_t float64_to_int16_scalbn(float64 a, int rmode, int scale,
2182                                 float_status *s)
2183 {
2184     return round_to_int_and_pack(float64_unpack_canonical(a, s),
2185                                  rmode, scale, INT16_MIN, INT16_MAX, s);
2186 }
2187 
2188 int32_t float64_to_int32_scalbn(float64 a, int rmode, int scale,
2189                                 float_status *s)
2190 {
2191     return round_to_int_and_pack(float64_unpack_canonical(a, s),
2192                                  rmode, scale, INT32_MIN, INT32_MAX, s);
2193 }
2194 
2195 int64_t float64_to_int64_scalbn(float64 a, int rmode, int scale,
2196                                 float_status *s)
2197 {
2198     return round_to_int_and_pack(float64_unpack_canonical(a, s),
2199                                  rmode, scale, INT64_MIN, INT64_MAX, s);
2200 }
2201 
2202 int16_t float16_to_int16(float16 a, float_status *s)
2203 {
2204     return float16_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2205 }
2206 
2207 int32_t float16_to_int32(float16 a, float_status *s)
2208 {
2209     return float16_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2210 }
2211 
2212 int64_t float16_to_int64(float16 a, float_status *s)
2213 {
2214     return float16_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2215 }
2216 
2217 int16_t float32_to_int16(float32 a, float_status *s)
2218 {
2219     return float32_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2220 }
2221 
2222 int32_t float32_to_int32(float32 a, float_status *s)
2223 {
2224     return float32_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2225 }
2226 
2227 int64_t float32_to_int64(float32 a, float_status *s)
2228 {
2229     return float32_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2230 }
2231 
2232 int16_t float64_to_int16(float64 a, float_status *s)
2233 {
2234     return float64_to_int16_scalbn(a, s->float_rounding_mode, 0, s);
2235 }
2236 
2237 int32_t float64_to_int32(float64 a, float_status *s)
2238 {
2239     return float64_to_int32_scalbn(a, s->float_rounding_mode, 0, s);
2240 }
2241 
2242 int64_t float64_to_int64(float64 a, float_status *s)
2243 {
2244     return float64_to_int64_scalbn(a, s->float_rounding_mode, 0, s);
2245 }
2246 
2247 int16_t float16_to_int16_round_to_zero(float16 a, float_status *s)
2248 {
2249     return float16_to_int16_scalbn(a, float_round_to_zero, 0, s);
2250 }
2251 
2252 int32_t float16_to_int32_round_to_zero(float16 a, float_status *s)
2253 {
2254     return float16_to_int32_scalbn(a, float_round_to_zero, 0, s);
2255 }
2256 
2257 int64_t float16_to_int64_round_to_zero(float16 a, float_status *s)
2258 {
2259     return float16_to_int64_scalbn(a, float_round_to_zero, 0, s);
2260 }
2261 
2262 int16_t float32_to_int16_round_to_zero(float32 a, float_status *s)
2263 {
2264     return float32_to_int16_scalbn(a, float_round_to_zero, 0, s);
2265 }
2266 
2267 int32_t float32_to_int32_round_to_zero(float32 a, float_status *s)
2268 {
2269     return float32_to_int32_scalbn(a, float_round_to_zero, 0, s);
2270 }
2271 
2272 int64_t float32_to_int64_round_to_zero(float32 a, float_status *s)
2273 {
2274     return float32_to_int64_scalbn(a, float_round_to_zero, 0, s);
2275 }
2276 
2277 int16_t float64_to_int16_round_to_zero(float64 a, float_status *s)
2278 {
2279     return float64_to_int16_scalbn(a, float_round_to_zero, 0, s);
2280 }
2281 
2282 int32_t float64_to_int32_round_to_zero(float64 a, float_status *s)
2283 {
2284     return float64_to_int32_scalbn(a, float_round_to_zero, 0, s);
2285 }
2286 
2287 int64_t float64_to_int64_round_to_zero(float64 a, float_status *s)
2288 {
2289     return float64_to_int64_scalbn(a, float_round_to_zero, 0, s);
2290 }
2291 
/*
 *  Returns the result of converting the floating-point value `a' to
 *  the unsigned integer format. The conversion is performed according
 *  to the IEC/IEEE Standard for Binary Floating-Point
 *  Arithmetic---which means in particular that the conversion is
 *  rounded according to the current rounding mode. If `a' is a NaN,
 *  the largest unsigned integer is returned. Otherwise, if the
 *  conversion overflows, the largest unsigned integer is returned. If
 *  `a' is negative, the value is rounded and zero is returned;
 *  negative values that do not round to zero raise the invalid
 *  exception flag.
 */
2304 
2305 static uint64_t round_to_uint_and_pack(FloatParts in, int rmode, int scale,
2306                                        uint64_t max, float_status *s)
2307 {
2308     int orig_flags = get_float_exception_flags(s);
2309     FloatParts p = round_to_int(in, rmode, scale, s);
2310     uint64_t r;
2311 
2312     switch (p.cls) {
2313     case float_class_snan:
2314     case float_class_qnan:
2315         s->float_exception_flags = orig_flags | float_flag_invalid;
2316         return max;
2317     case float_class_inf:
2318         s->float_exception_flags = orig_flags | float_flag_invalid;
2319         return p.sign ? 0 : max;
2320     case float_class_zero:
2321         return 0;
2322     case float_class_normal:
2323         if (p.sign) {
2324             s->float_exception_flags = orig_flags | float_flag_invalid;
2325             return 0;
2326         }
2327 
2328         if (p.exp < DECOMPOSED_BINARY_POINT) {
2329             r = p.frac >> (DECOMPOSED_BINARY_POINT - p.exp);
2330         } else if (p.exp - DECOMPOSED_BINARY_POINT < 2) {
2331             r = p.frac << (p.exp - DECOMPOSED_BINARY_POINT);
2332         } else {
2333             s->float_exception_flags = orig_flags | float_flag_invalid;
2334             return max;
2335         }
2336 
2337         /* For uint64 this will never trip, but if p.exp is too large
2338          * to shift a decomposed fraction we shall have exited via the
2339          * 3rd leg above.
2340          */
2341         if (r > max) {
2342             s->float_exception_flags = orig_flags | float_flag_invalid;
2343             return max;
2344         }
2345         return r;
2346     default:
2347         g_assert_not_reached();
2348     }
2349 }
2350 
2351 uint16_t float16_to_uint16_scalbn(float16 a, int rmode, int scale,
2352                                   float_status *s)
2353 {
2354     return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2355                                   rmode, scale, UINT16_MAX, s);
2356 }
2357 
2358 uint32_t float16_to_uint32_scalbn(float16 a, int rmode, int scale,
2359                                   float_status *s)
2360 {
2361     return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2362                                   rmode, scale, UINT32_MAX, s);
2363 }
2364 
2365 uint64_t float16_to_uint64_scalbn(float16 a, int rmode, int scale,
2366                                   float_status *s)
2367 {
2368     return round_to_uint_and_pack(float16_unpack_canonical(a, s),
2369                                   rmode, scale, UINT64_MAX, s);
2370 }
2371 
2372 uint16_t float32_to_uint16_scalbn(float32 a, int rmode, int scale,
2373                                   float_status *s)
2374 {
2375     return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2376                                   rmode, scale, UINT16_MAX, s);
2377 }
2378 
2379 uint32_t float32_to_uint32_scalbn(float32 a, int rmode, int scale,
2380                                   float_status *s)
2381 {
2382     return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2383                                   rmode, scale, UINT32_MAX, s);
2384 }
2385 
2386 uint64_t float32_to_uint64_scalbn(float32 a, int rmode, int scale,
2387                                   float_status *s)
2388 {
2389     return round_to_uint_and_pack(float32_unpack_canonical(a, s),
2390                                   rmode, scale, UINT64_MAX, s);
2391 }
2392 
2393 uint16_t float64_to_uint16_scalbn(float64 a, int rmode, int scale,
2394                                   float_status *s)
2395 {
2396     return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2397                                   rmode, scale, UINT16_MAX, s);
2398 }
2399 
2400 uint32_t float64_to_uint32_scalbn(float64 a, int rmode, int scale,
2401                                   float_status *s)
2402 {
2403     return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2404                                   rmode, scale, UINT32_MAX, s);
2405 }
2406 
2407 uint64_t float64_to_uint64_scalbn(float64 a, int rmode, int scale,
2408                                   float_status *s)
2409 {
2410     return round_to_uint_and_pack(float64_unpack_canonical(a, s),
2411                                   rmode, scale, UINT64_MAX, s);
2412 }
2413 
2414 uint16_t float16_to_uint16(float16 a, float_status *s)
2415 {
2416     return float16_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2417 }
2418 
2419 uint32_t float16_to_uint32(float16 a, float_status *s)
2420 {
2421     return float16_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2422 }
2423 
2424 uint64_t float16_to_uint64(float16 a, float_status *s)
2425 {
2426     return float16_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2427 }
2428 
2429 uint16_t float32_to_uint16(float32 a, float_status *s)
2430 {
2431     return float32_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2432 }
2433 
2434 uint32_t float32_to_uint32(float32 a, float_status *s)
2435 {
2436     return float32_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2437 }
2438 
2439 uint64_t float32_to_uint64(float32 a, float_status *s)
2440 {
2441     return float32_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2442 }
2443 
2444 uint16_t float64_to_uint16(float64 a, float_status *s)
2445 {
2446     return float64_to_uint16_scalbn(a, s->float_rounding_mode, 0, s);
2447 }
2448 
2449 uint32_t float64_to_uint32(float64 a, float_status *s)
2450 {
2451     return float64_to_uint32_scalbn(a, s->float_rounding_mode, 0, s);
2452 }
2453 
2454 uint64_t float64_to_uint64(float64 a, float_status *s)
2455 {
2456     return float64_to_uint64_scalbn(a, s->float_rounding_mode, 0, s);
2457 }
2458 
2459 uint16_t float16_to_uint16_round_to_zero(float16 a, float_status *s)
2460 {
2461     return float16_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2462 }
2463 
2464 uint32_t float16_to_uint32_round_to_zero(float16 a, float_status *s)
2465 {
2466     return float16_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2467 }
2468 
2469 uint64_t float16_to_uint64_round_to_zero(float16 a, float_status *s)
2470 {
2471     return float16_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2472 }
2473 
2474 uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *s)
2475 {
2476     return float32_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2477 }
2478 
2479 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *s)
2480 {
2481     return float32_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2482 }
2483 
2484 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *s)
2485 {
2486     return float32_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2487 }
2488 
2489 uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *s)
2490 {
2491     return float64_to_uint16_scalbn(a, float_round_to_zero, 0, s);
2492 }
2493 
2494 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *s)
2495 {
2496     return float64_to_uint32_scalbn(a, float_round_to_zero, 0, s);
2497 }
2498 
2499 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *s)
2500 {
2501     return float64_to_uint64_scalbn(a, float_round_to_zero, 0, s);
2502 }
2503 
2504 /*
2505  * Integer to float conversions
2506  *
2507  * Returns the result of converting the two's complement integer `a'
2508  * to the floating-point format. The conversion is performed according
2509  * to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2510  */
2511 
2512 static FloatParts int_to_float(int64_t a, int scale, float_status *status)
2513 {
2514     FloatParts r = { .sign = false };
2515 
2516     if (a == 0) {
2517         r.cls = float_class_zero;
2518     } else {
2519         uint64_t f = a;
2520         int shift;
2521 
2522         r.cls = float_class_normal;
2523         if (a < 0) {
2524             f = -f;
2525             r.sign = true;
2526         }
2527         shift = clz64(f) - 1;
2528         scale = MIN(MAX(scale, -0x10000), 0x10000);
2529 
2530         r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
2531         r.frac = (shift < 0 ? DECOMPOSED_IMPLICIT_BIT : f << shift);
2532     }
2533 
2534     return r;
2535 }
2536 
2537 float16 int64_to_float16_scalbn(int64_t a, int scale, float_status *status)
2538 {
2539     FloatParts pa = int_to_float(a, scale, status);
2540     return float16_round_pack_canonical(pa, status);
2541 }
2542 
2543 float16 int32_to_float16_scalbn(int32_t a, int scale, float_status *status)
2544 {
2545     return int64_to_float16_scalbn(a, scale, status);
2546 }
2547 
2548 float16 int16_to_float16_scalbn(int16_t a, int scale, float_status *status)
2549 {
2550     return int64_to_float16_scalbn(a, scale, status);
2551 }
2552 
2553 float16 int64_to_float16(int64_t a, float_status *status)
2554 {
2555     return int64_to_float16_scalbn(a, 0, status);
2556 }
2557 
2558 float16 int32_to_float16(int32_t a, float_status *status)
2559 {
2560     return int64_to_float16_scalbn(a, 0, status);
2561 }
2562 
2563 float16 int16_to_float16(int16_t a, float_status *status)
2564 {
2565     return int64_to_float16_scalbn(a, 0, status);
2566 }
2567 
2568 float32 int64_to_float32_scalbn(int64_t a, int scale, float_status *status)
2569 {
2570     FloatParts pa = int_to_float(a, scale, status);
2571     return float32_round_pack_canonical(pa, status);
2572 }
2573 
2574 float32 int32_to_float32_scalbn(int32_t a, int scale, float_status *status)
2575 {
2576     return int64_to_float32_scalbn(a, scale, status);
2577 }
2578 
2579 float32 int16_to_float32_scalbn(int16_t a, int scale, float_status *status)
2580 {
2581     return int64_to_float32_scalbn(a, scale, status);
2582 }
2583 
2584 float32 int64_to_float32(int64_t a, float_status *status)
2585 {
2586     return int64_to_float32_scalbn(a, 0, status);
2587 }
2588 
2589 float32 int32_to_float32(int32_t a, float_status *status)
2590 {
2591     return int64_to_float32_scalbn(a, 0, status);
2592 }
2593 
2594 float32 int16_to_float32(int16_t a, float_status *status)
2595 {
2596     return int64_to_float32_scalbn(a, 0, status);
2597 }
2598 
2599 float64 int64_to_float64_scalbn(int64_t a, int scale, float_status *status)
2600 {
2601     FloatParts pa = int_to_float(a, scale, status);
2602     return float64_round_pack_canonical(pa, status);
2603 }
2604 
2605 float64 int32_to_float64_scalbn(int32_t a, int scale, float_status *status)
2606 {
2607     return int64_to_float64_scalbn(a, scale, status);
2608 }
2609 
2610 float64 int16_to_float64_scalbn(int16_t a, int scale, float_status *status)
2611 {
2612     return int64_to_float64_scalbn(a, scale, status);
2613 }
2614 
2615 float64 int64_to_float64(int64_t a, float_status *status)
2616 {
2617     return int64_to_float64_scalbn(a, 0, status);
2618 }
2619 
2620 float64 int32_to_float64(int32_t a, float_status *status)
2621 {
2622     return int64_to_float64_scalbn(a, 0, status);
2623 }
2624 
2625 float64 int16_to_float64(int16_t a, float_status *status)
2626 {
2627     return int64_to_float64_scalbn(a, 0, status);
2628 }
2629 
2630 
2631 /*
2632  * Unsigned Integer to float conversions
2633  *
2634  * Returns the result of converting the unsigned integer `a' to the
2635  * floating-point format. The conversion is performed according to the
2636  * IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2637  */
2638 
2639 static FloatParts uint_to_float(uint64_t a, int scale, float_status *status)
2640 {
2641     FloatParts r = { .sign = false };
2642 
2643     if (a == 0) {
2644         r.cls = float_class_zero;
2645     } else {
2646         scale = MIN(MAX(scale, -0x10000), 0x10000);
2647         r.cls = float_class_normal;
2648         if ((int64_t)a < 0) {
2649             r.exp = DECOMPOSED_BINARY_POINT + 1 + scale;
2650             shift64RightJamming(a, 1, &a);
2651             r.frac = a;
2652         } else {
2653             int shift = clz64(a) - 1;
2654             r.exp = DECOMPOSED_BINARY_POINT - shift + scale;
2655             r.frac = a << shift;
2656         }
2657     }
2658 
2659     return r;
2660 }
2661 
2662 float16 uint64_to_float16_scalbn(uint64_t a, int scale, float_status *status)
2663 {
2664     FloatParts pa = uint_to_float(a, scale, status);
2665     return float16_round_pack_canonical(pa, status);
2666 }
2667 
2668 float16 uint32_to_float16_scalbn(uint32_t a, int scale, float_status *status)
2669 {
2670     return uint64_to_float16_scalbn(a, scale, status);
2671 }
2672 
2673 float16 uint16_to_float16_scalbn(uint16_t a, int scale, float_status *status)
2674 {
2675     return uint64_to_float16_scalbn(a, scale, status);
2676 }
2677 
2678 float16 uint64_to_float16(uint64_t a, float_status *status)
2679 {
2680     return uint64_to_float16_scalbn(a, 0, status);
2681 }
2682 
2683 float16 uint32_to_float16(uint32_t a, float_status *status)
2684 {
2685     return uint64_to_float16_scalbn(a, 0, status);
2686 }
2687 
2688 float16 uint16_to_float16(uint16_t a, float_status *status)
2689 {
2690     return uint64_to_float16_scalbn(a, 0, status);
2691 }
2692 
2693 float32 uint64_to_float32_scalbn(uint64_t a, int scale, float_status *status)
2694 {
2695     FloatParts pa = uint_to_float(a, scale, status);
2696     return float32_round_pack_canonical(pa, status);
2697 }
2698 
2699 float32 uint32_to_float32_scalbn(uint32_t a, int scale, float_status *status)
2700 {
2701     return uint64_to_float32_scalbn(a, scale, status);
2702 }
2703 
2704 float32 uint16_to_float32_scalbn(uint16_t a, int scale, float_status *status)
2705 {
2706     return uint64_to_float32_scalbn(a, scale, status);
2707 }
2708 
2709 float32 uint64_to_float32(uint64_t a, float_status *status)
2710 {
2711     return uint64_to_float32_scalbn(a, 0, status);
2712 }
2713 
2714 float32 uint32_to_float32(uint32_t a, float_status *status)
2715 {
2716     return uint64_to_float32_scalbn(a, 0, status);
2717 }
2718 
2719 float32 uint16_to_float32(uint16_t a, float_status *status)
2720 {
2721     return uint64_to_float32_scalbn(a, 0, status);
2722 }
2723 
2724 float64 uint64_to_float64_scalbn(uint64_t a, int scale, float_status *status)
2725 {
2726     FloatParts pa = uint_to_float(a, scale, status);
2727     return float64_round_pack_canonical(pa, status);
2728 }
2729 
2730 float64 uint32_to_float64_scalbn(uint32_t a, int scale, float_status *status)
2731 {
2732     return uint64_to_float64_scalbn(a, scale, status);
2733 }
2734 
2735 float64 uint16_to_float64_scalbn(uint16_t a, int scale, float_status *status)
2736 {
2737     return uint64_to_float64_scalbn(a, scale, status);
2738 }
2739 
2740 float64 uint64_to_float64(uint64_t a, float_status *status)
2741 {
2742     return uint64_to_float64_scalbn(a, 0, status);
2743 }
2744 
2745 float64 uint32_to_float64(uint32_t a, float_status *status)
2746 {
2747     return uint64_to_float64_scalbn(a, 0, status);
2748 }
2749 
2750 float64 uint16_to_float64(uint16_t a, float_status *status)
2751 {
2752     return uint64_to_float64_scalbn(a, 0, status);
2753 }
2754 
2755 /* Float Min/Max */
2756 /* min() and max() functions. These can't be implemented as
2757  * 'compare and pick one input' because that would mishandle
2758  * NaNs and +0 vs -0.
2759  *
2760  * minnum() and maxnum() functions. These are similar to the min()
2761  * and max() functions but if one of the arguments is a QNaN and
2762  * the other is numerical then the numerical argument is returned.
2763  * SNaNs will get quietened before being returned.
2764  * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
2765  * and maxNum() operations. min() and max() are the typical min/max
2766  * semantics provided by many CPUs which predate that specification.
2767  *
2768  * minnummag() and maxnummag() functions correspond to minNumMag()
2769  * and minNumMag() from the IEEE-754 2008.
2770  */
2771 static FloatParts minmax_floats(FloatParts a, FloatParts b, bool ismin,
2772                                 bool ieee, bool ismag, float_status *s)
2773 {
2774     if (unlikely(is_nan(a.cls) || is_nan(b.cls))) {
2775         if (ieee) {
2776             /* Takes two floating-point values `a' and `b', one of
2777              * which is a NaN, and returns the appropriate NaN
2778              * result. If either `a' or `b' is a signaling NaN,
2779              * the invalid exception is raised.
2780              */
2781             if (is_snan(a.cls) || is_snan(b.cls)) {
2782                 return pick_nan(a, b, s);
2783             } else if (is_nan(a.cls) && !is_nan(b.cls)) {
2784                 return b;
2785             } else if (is_nan(b.cls) && !is_nan(a.cls)) {
2786                 return a;
2787             }
2788         }
2789         return pick_nan(a, b, s);
2790     } else {
2791         int a_exp, b_exp;
2792 
2793         switch (a.cls) {
2794         case float_class_normal:
2795             a_exp = a.exp;
2796             break;
2797         case float_class_inf:
2798             a_exp = INT_MAX;
2799             break;
2800         case float_class_zero:
2801             a_exp = INT_MIN;
2802             break;
2803         default:
2804             g_assert_not_reached();
2805             break;
2806         }
2807         switch (b.cls) {
2808         case float_class_normal:
2809             b_exp = b.exp;
2810             break;
2811         case float_class_inf:
2812             b_exp = INT_MAX;
2813             break;
2814         case float_class_zero:
2815             b_exp = INT_MIN;
2816             break;
2817         default:
2818             g_assert_not_reached();
2819             break;
2820         }
2821 
2822         if (ismag && (a_exp != b_exp || a.frac != b.frac)) {
2823             bool a_less = a_exp < b_exp;
2824             if (a_exp == b_exp) {
2825                 a_less = a.frac < b.frac;
2826             }
2827             return a_less ^ ismin ? b : a;
2828         }
2829 
2830         if (a.sign == b.sign) {
2831             bool a_less = a_exp < b_exp;
2832             if (a_exp == b_exp) {
2833                 a_less = a.frac < b.frac;
2834             }
2835             return a.sign ^ a_less ^ ismin ? b : a;
2836         } else {
2837             return a.sign ^ ismin ? b : a;
2838         }
2839     }
2840 }
2841 
2842 #define MINMAX(sz, name, ismin, isiee, ismag)                           \
2843 float ## sz float ## sz ## _ ## name(float ## sz a, float ## sz b,      \
2844                                      float_status *s)                   \
2845 {                                                                       \
2846     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
2847     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
2848     FloatParts pr = minmax_floats(pa, pb, ismin, isiee, ismag, s);      \
2849                                                                         \
2850     return float ## sz ## _round_pack_canonical(pr, s);                 \
2851 }
2852 
2853 MINMAX(16, min, true, false, false)
2854 MINMAX(16, minnum, true, true, false)
2855 MINMAX(16, minnummag, true, true, true)
2856 MINMAX(16, max, false, false, false)
2857 MINMAX(16, maxnum, false, true, false)
2858 MINMAX(16, maxnummag, false, true, true)
2859 
2860 MINMAX(32, min, true, false, false)
2861 MINMAX(32, minnum, true, true, false)
2862 MINMAX(32, minnummag, true, true, true)
2863 MINMAX(32, max, false, false, false)
2864 MINMAX(32, maxnum, false, true, false)
2865 MINMAX(32, maxnummag, false, true, true)
2866 
2867 MINMAX(64, min, true, false, false)
2868 MINMAX(64, minnum, true, true, false)
2869 MINMAX(64, minnummag, true, true, true)
2870 MINMAX(64, max, false, false, false)
2871 MINMAX(64, maxnum, false, true, false)
2872 MINMAX(64, maxnummag, false, true, true)
2873 
2874 #undef MINMAX
2875 
2876 /* Floating point compare */
2877 static int compare_floats(FloatParts a, FloatParts b, bool is_quiet,
2878                           float_status *s)
2879 {
2880     if (is_nan(a.cls) || is_nan(b.cls)) {
2881         if (!is_quiet ||
2882             a.cls == float_class_snan ||
2883             b.cls == float_class_snan) {
2884             s->float_exception_flags |= float_flag_invalid;
2885         }
2886         return float_relation_unordered;
2887     }
2888 
2889     if (a.cls == float_class_zero) {
2890         if (b.cls == float_class_zero) {
2891             return float_relation_equal;
2892         }
2893         return b.sign ? float_relation_greater : float_relation_less;
2894     } else if (b.cls == float_class_zero) {
2895         return a.sign ? float_relation_less : float_relation_greater;
2896     }
2897 
2898     /* The only really important thing about infinity is its sign. If
2899      * both are infinities the sign marks the smallest of the two.
2900      */
2901     if (a.cls == float_class_inf) {
2902         if ((b.cls == float_class_inf) && (a.sign == b.sign)) {
2903             return float_relation_equal;
2904         }
2905         return a.sign ? float_relation_less : float_relation_greater;
2906     } else if (b.cls == float_class_inf) {
2907         return b.sign ? float_relation_greater : float_relation_less;
2908     }
2909 
2910     if (a.sign != b.sign) {
2911         return a.sign ? float_relation_less : float_relation_greater;
2912     }
2913 
2914     if (a.exp == b.exp) {
2915         if (a.frac == b.frac) {
2916             return float_relation_equal;
2917         }
2918         if (a.sign) {
2919             return a.frac > b.frac ?
2920                 float_relation_less : float_relation_greater;
2921         } else {
2922             return a.frac > b.frac ?
2923                 float_relation_greater : float_relation_less;
2924         }
2925     } else {
2926         if (a.sign) {
2927             return a.exp > b.exp ? float_relation_less : float_relation_greater;
2928         } else {
2929             return a.exp > b.exp ? float_relation_greater : float_relation_less;
2930         }
2931     }
2932 }
2933 
2934 #define COMPARE(name, attr, sz)                                         \
2935 static int attr                                                         \
2936 name(float ## sz a, float ## sz b, bool is_quiet, float_status *s)      \
2937 {                                                                       \
2938     FloatParts pa = float ## sz ## _unpack_canonical(a, s);             \
2939     FloatParts pb = float ## sz ## _unpack_canonical(b, s);             \
2940     return compare_floats(pa, pb, is_quiet, s);                         \
2941 }
2942 
2943 COMPARE(soft_f16_compare, QEMU_FLATTEN, 16)
2944 COMPARE(soft_f32_compare, QEMU_SOFTFLOAT_ATTR, 32)
2945 COMPARE(soft_f64_compare, QEMU_SOFTFLOAT_ATTR, 64)
2946 
2947 #undef COMPARE
2948 
2949 int float16_compare(float16 a, float16 b, float_status *s)
2950 {
2951     return soft_f16_compare(a, b, false, s);
2952 }
2953 
2954 int float16_compare_quiet(float16 a, float16 b, float_status *s)
2955 {
2956     return soft_f16_compare(a, b, true, s);
2957 }
2958 
2959 static int QEMU_FLATTEN
2960 f32_compare(float32 xa, float32 xb, bool is_quiet, float_status *s)
2961 {
2962     union_float32 ua, ub;
2963 
2964     ua.s = xa;
2965     ub.s = xb;
2966 
2967     if (QEMU_NO_HARDFLOAT) {
2968         goto soft;
2969     }
2970 
2971     float32_input_flush2(&ua.s, &ub.s, s);
2972     if (isgreaterequal(ua.h, ub.h)) {
2973         if (isgreater(ua.h, ub.h)) {
2974             return float_relation_greater;
2975         }
2976         return float_relation_equal;
2977     }
2978     if (likely(isless(ua.h, ub.h))) {
2979         return float_relation_less;
2980     }
2981     /* The only condition remaining is unordered.
2982      * Fall through to set flags.
2983      */
2984  soft:
2985     return soft_f32_compare(ua.s, ub.s, is_quiet, s);
2986 }
2987 
2988 int float32_compare(float32 a, float32 b, float_status *s)
2989 {
2990     return f32_compare(a, b, false, s);
2991 }
2992 
2993 int float32_compare_quiet(float32 a, float32 b, float_status *s)
2994 {
2995     return f32_compare(a, b, true, s);
2996 }
2997 
2998 static int QEMU_FLATTEN
2999 f64_compare(float64 xa, float64 xb, bool is_quiet, float_status *s)
3000 {
3001     union_float64 ua, ub;
3002 
3003     ua.s = xa;
3004     ub.s = xb;
3005 
3006     if (QEMU_NO_HARDFLOAT) {
3007         goto soft;
3008     }
3009 
3010     float64_input_flush2(&ua.s, &ub.s, s);
3011     if (isgreaterequal(ua.h, ub.h)) {
3012         if (isgreater(ua.h, ub.h)) {
3013             return float_relation_greater;
3014         }
3015         return float_relation_equal;
3016     }
3017     if (likely(isless(ua.h, ub.h))) {
3018         return float_relation_less;
3019     }
3020     /* The only condition remaining is unordered.
3021      * Fall through to set flags.
3022      */
3023  soft:
3024     return soft_f64_compare(ua.s, ub.s, is_quiet, s);
3025 }
3026 
3027 int float64_compare(float64 a, float64 b, float_status *s)
3028 {
3029     return f64_compare(a, b, false, s);
3030 }
3031 
3032 int float64_compare_quiet(float64 a, float64 b, float_status *s)
3033 {
3034     return f64_compare(a, b, true, s);
3035 }
3036 
3037 /* Multiply A by 2 raised to the power N.  */
3038 static FloatParts scalbn_decomposed(FloatParts a, int n, float_status *s)
3039 {
3040     if (unlikely(is_nan(a.cls))) {
3041         return return_nan(a, s);
3042     }
3043     if (a.cls == float_class_normal) {
3044         /* The largest float type (even though not supported by FloatParts)
3045          * is float128, which has a 15 bit exponent.  Bounding N to 16 bits
3046          * still allows rounding to infinity, without allowing overflow
3047          * within the int32_t that backs FloatParts.exp.
3048          */
3049         n = MIN(MAX(n, -0x10000), 0x10000);
3050         a.exp += n;
3051     }
3052     return a;
3053 }
3054 
3055 float16 float16_scalbn(float16 a, int n, float_status *status)
3056 {
3057     FloatParts pa = float16_unpack_canonical(a, status);
3058     FloatParts pr = scalbn_decomposed(pa, n, status);
3059     return float16_round_pack_canonical(pr, status);
3060 }
3061 
3062 float32 float32_scalbn(float32 a, int n, float_status *status)
3063 {
3064     FloatParts pa = float32_unpack_canonical(a, status);
3065     FloatParts pr = scalbn_decomposed(pa, n, status);
3066     return float32_round_pack_canonical(pr, status);
3067 }
3068 
3069 float64 float64_scalbn(float64 a, int n, float_status *status)
3070 {
3071     FloatParts pa = float64_unpack_canonical(a, status);
3072     FloatParts pr = scalbn_decomposed(pa, n, status);
3073     return float64_round_pack_canonical(pr, status);
3074 }
3075 
3076 /*
3077  * Square Root
3078  *
3079  * The old softfloat code did an approximation step before zeroing in
3080  * on the final result. However for simpleness we just compute the
3081  * square root by iterating down from the implicit bit to enough extra
3082  * bits to ensure we get a correctly rounded result.
3083  *
3084  * This does mean however the calculation is slower than before,
3085  * especially for 64 bit floats.
3086  */
3087 
3088 static FloatParts sqrt_float(FloatParts a, float_status *s, const FloatFmt *p)
3089 {
3090     uint64_t a_frac, r_frac, s_frac;
3091     int bit, last_bit;
3092 
3093     if (is_nan(a.cls)) {
3094         return return_nan(a, s);
3095     }
3096     if (a.cls == float_class_zero) {
3097         return a;  /* sqrt(+-0) = +-0 */
3098     }
3099     if (a.sign) {
3100         s->float_exception_flags |= float_flag_invalid;
3101         return parts_default_nan(s);
3102     }
3103     if (a.cls == float_class_inf) {
3104         return a;  /* sqrt(+inf) = +inf */
3105     }
3106 
3107     assert(a.cls == float_class_normal);
3108 
3109     /* We need two overflow bits at the top. Adding room for that is a
3110      * right shift. If the exponent is odd, we can discard the low bit
3111      * by multiplying the fraction by 2; that's a left shift. Combine
3112      * those and we shift right if the exponent is even.
3113      */
3114     a_frac = a.frac;
3115     if (!(a.exp & 1)) {
3116         a_frac >>= 1;
3117     }
3118     a.exp >>= 1;
3119 
3120     /* Bit-by-bit computation of sqrt.  */
3121     r_frac = 0;
3122     s_frac = 0;
3123 
3124     /* Iterate from implicit bit down to the 3 extra bits to compute a
3125      * properly rounded result. Remember we've inserted one more bit
3126      * at the top, so these positions are one less.
3127      */
3128     bit = DECOMPOSED_BINARY_POINT - 1;
3129     last_bit = MAX(p->frac_shift - 4, 0);
3130     do {
3131         uint64_t q = 1ULL << bit;
3132         uint64_t t_frac = s_frac + q;
3133         if (t_frac <= a_frac) {
3134             s_frac = t_frac + q;
3135             a_frac -= t_frac;
3136             r_frac += q;
3137         }
3138         a_frac <<= 1;
3139     } while (--bit >= last_bit);
3140 
3141     /* Undo the right shift done above. If there is any remaining
3142      * fraction, the result is inexact. Set the sticky bit.
3143      */
3144     a.frac = (r_frac << 1) + (a_frac != 0);
3145 
3146     return a;
3147 }
3148 
3149 float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status)
3150 {
3151     FloatParts pa = float16_unpack_canonical(a, status);
3152     FloatParts pr = sqrt_float(pa, status, &float16_params);
3153     return float16_round_pack_canonical(pr, status);
3154 }
3155 
3156 static float32 QEMU_SOFTFLOAT_ATTR
3157 soft_f32_sqrt(float32 a, float_status *status)
3158 {
3159     FloatParts pa = float32_unpack_canonical(a, status);
3160     FloatParts pr = sqrt_float(pa, status, &float32_params);
3161     return float32_round_pack_canonical(pr, status);
3162 }
3163 
3164 static float64 QEMU_SOFTFLOAT_ATTR
3165 soft_f64_sqrt(float64 a, float_status *status)
3166 {
3167     FloatParts pa = float64_unpack_canonical(a, status);
3168     FloatParts pr = sqrt_float(pa, status, &float64_params);
3169     return float64_round_pack_canonical(pr, status);
3170 }
3171 
3172 float32 QEMU_FLATTEN float32_sqrt(float32 xa, float_status *s)
3173 {
3174     union_float32 ua, ur;
3175 
3176     ua.s = xa;
3177     if (unlikely(!can_use_fpu(s))) {
3178         goto soft;
3179     }
3180 
3181     float32_input_flush1(&ua.s, s);
3182     if (QEMU_HARDFLOAT_1F32_USE_FP) {
3183         if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
3184                        fpclassify(ua.h) == FP_ZERO) ||
3185                      signbit(ua.h))) {
3186             goto soft;
3187         }
3188     } else if (unlikely(!float32_is_zero_or_normal(ua.s) ||
3189                         float32_is_neg(ua.s))) {
3190         goto soft;
3191     }
3192     ur.h = sqrtf(ua.h);
3193     return ur.s;
3194 
3195  soft:
3196     return soft_f32_sqrt(ua.s, s);
3197 }
3198 
3199 float64 QEMU_FLATTEN float64_sqrt(float64 xa, float_status *s)
3200 {
3201     union_float64 ua, ur;
3202 
3203     ua.s = xa;
3204     if (unlikely(!can_use_fpu(s))) {
3205         goto soft;
3206     }
3207 
3208     float64_input_flush1(&ua.s, s);
3209     if (QEMU_HARDFLOAT_1F64_USE_FP) {
3210         if (unlikely(!(fpclassify(ua.h) == FP_NORMAL ||
3211                        fpclassify(ua.h) == FP_ZERO) ||
3212                      signbit(ua.h))) {
3213             goto soft;
3214         }
3215     } else if (unlikely(!float64_is_zero_or_normal(ua.s) ||
3216                         float64_is_neg(ua.s))) {
3217         goto soft;
3218     }
3219     ur.h = sqrt(ua.h);
3220     return ur.s;
3221 
3222  soft:
3223     return soft_f64_sqrt(ua.s, s);
3224 }
3225 
3226 /*----------------------------------------------------------------------------
3227 | The pattern for a default generated NaN.
3228 *----------------------------------------------------------------------------*/
3229 
3230 float16 float16_default_nan(float_status *status)
3231 {
3232     FloatParts p = parts_default_nan(status);
3233     p.frac >>= float16_params.frac_shift;
3234     return float16_pack_raw(p);
3235 }
3236 
3237 float32 float32_default_nan(float_status *status)
3238 {
3239     FloatParts p = parts_default_nan(status);
3240     p.frac >>= float32_params.frac_shift;
3241     return float32_pack_raw(p);
3242 }
3243 
3244 float64 float64_default_nan(float_status *status)
3245 {
3246     FloatParts p = parts_default_nan(status);
3247     p.frac >>= float64_params.frac_shift;
3248     return float64_pack_raw(p);
3249 }
3250 
3251 float128 float128_default_nan(float_status *status)
3252 {
3253     FloatParts p = parts_default_nan(status);
3254     float128 r;
3255 
3256     /* Extrapolate from the choices made by parts_default_nan to fill
3257      * in the quad-floating format.  If the low bit is set, assume we
3258      * want to set all non-snan bits.
3259      */
3260     r.low = -(p.frac & 1);
3261     r.high = p.frac >> (DECOMPOSED_BINARY_POINT - 48);
3262     r.high |= LIT64(0x7FFF000000000000);
3263     r.high |= (uint64_t)p.sign << 63;
3264 
3265     return r;
3266 }
3267 
3268 /*----------------------------------------------------------------------------
3269 | Returns a quiet NaN from a signalling NaN for the floating point value `a'.
3270 *----------------------------------------------------------------------------*/
3271 
3272 float16 float16_silence_nan(float16 a, float_status *status)
3273 {
3274     FloatParts p = float16_unpack_raw(a);
3275     p.frac <<= float16_params.frac_shift;
3276     p = parts_silence_nan(p, status);
3277     p.frac >>= float16_params.frac_shift;
3278     return float16_pack_raw(p);
3279 }
3280 
3281 float32 float32_silence_nan(float32 a, float_status *status)
3282 {
3283     FloatParts p = float32_unpack_raw(a);
3284     p.frac <<= float32_params.frac_shift;
3285     p = parts_silence_nan(p, status);
3286     p.frac >>= float32_params.frac_shift;
3287     return float32_pack_raw(p);
3288 }
3289 
3290 float64 float64_silence_nan(float64 a, float_status *status)
3291 {
3292     FloatParts p = float64_unpack_raw(a);
3293     p.frac <<= float64_params.frac_shift;
3294     p = parts_silence_nan(p, status);
3295     p.frac >>= float64_params.frac_shift;
3296     return float64_pack_raw(p);
3297 }
3298 
3299 /*----------------------------------------------------------------------------
3300 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
3301 | and 7, and returns the properly rounded 32-bit integer corresponding to the
3302 | input.  If `zSign' is 1, the input is negated before being converted to an
3303 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
3304 | is simply rounded to an integer, with the inexact exception raised if the
3305 | input cannot be represented exactly as an integer.  However, if the fixed-
3306 | point input is too large, the invalid exception is raised and the largest
3307 | positive or negative integer is returned.
3308 *----------------------------------------------------------------------------*/
3309 
3310 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
3311 {
3312     int8_t roundingMode;
3313     flag roundNearestEven;
3314     int8_t roundIncrement, roundBits;
3315     int32_t z;
3316 
3317     roundingMode = status->float_rounding_mode;
3318     roundNearestEven = ( roundingMode == float_round_nearest_even );
3319     switch (roundingMode) {
3320     case float_round_nearest_even:
3321     case float_round_ties_away:
3322         roundIncrement = 0x40;
3323         break;
3324     case float_round_to_zero:
3325         roundIncrement = 0;
3326         break;
3327     case float_round_up:
3328         roundIncrement = zSign ? 0 : 0x7f;
3329         break;
3330     case float_round_down:
3331         roundIncrement = zSign ? 0x7f : 0;
3332         break;
3333     case float_round_to_odd:
3334         roundIncrement = absZ & 0x80 ? 0 : 0x7f;
3335         break;
3336     default:
3337         abort();
3338     }
3339     roundBits = absZ & 0x7F;
3340     absZ = ( absZ + roundIncrement )>>7;
3341     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
3342     z = absZ;
3343     if ( zSign ) z = - z;
3344     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
3345         float_raise(float_flag_invalid, status);
3346         return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
3347     }
3348     if (roundBits) {
3349         status->float_exception_flags |= float_flag_inexact;
3350     }
3351     return z;
3352 
3353 }
3354 
3355 /*----------------------------------------------------------------------------
3356 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3357 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3358 | and returns the properly rounded 64-bit integer corresponding to the input.
3359 | If `zSign' is 1, the input is negated before being converted to an integer.
3360 | Ordinarily, the fixed-point input is simply rounded to an integer, with
3361 | the inexact exception raised if the input cannot be represented exactly as
3362 | an integer.  However, if the fixed-point input is too large, the invalid
3363 | exception is raised and the largest positive or negative integer is
3364 | returned.
3365 *----------------------------------------------------------------------------*/
3366 
3367 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
3368                                float_status *status)
3369 {
3370     int8_t roundingMode;
3371     flag roundNearestEven, increment;
3372     int64_t z;
3373 
3374     roundingMode = status->float_rounding_mode;
3375     roundNearestEven = ( roundingMode == float_round_nearest_even );
3376     switch (roundingMode) {
3377     case float_round_nearest_even:
3378     case float_round_ties_away:
3379         increment = ((int64_t) absZ1 < 0);
3380         break;
3381     case float_round_to_zero:
3382         increment = 0;
3383         break;
3384     case float_round_up:
3385         increment = !zSign && absZ1;
3386         break;
3387     case float_round_down:
3388         increment = zSign && absZ1;
3389         break;
3390     case float_round_to_odd:
3391         increment = !(absZ0 & 1) && absZ1;
3392         break;
3393     default:
3394         abort();
3395     }
3396     if ( increment ) {
3397         ++absZ0;
3398         if ( absZ0 == 0 ) goto overflow;
3399         absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
3400     }
3401     z = absZ0;
3402     if ( zSign ) z = - z;
3403     if ( z && ( ( z < 0 ) ^ zSign ) ) {
3404  overflow:
3405         float_raise(float_flag_invalid, status);
3406         return
3407               zSign ? (int64_t) LIT64( 0x8000000000000000 )
3408             : LIT64( 0x7FFFFFFFFFFFFFFF );
3409     }
3410     if (absZ1) {
3411         status->float_exception_flags |= float_flag_inexact;
3412     }
3413     return z;
3414 
3415 }
3416 
3417 /*----------------------------------------------------------------------------
3418 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
3419 | `absZ1', with binary point between bits 63 and 64 (between the input words),
3420 | and returns the properly rounded 64-bit unsigned integer corresponding to the
3421 | input.  Ordinarily, the fixed-point input is simply rounded to an integer,
3422 | with the inexact exception raised if the input cannot be represented exactly
3423 | as an integer.  However, if the fixed-point input is too large, the invalid
3424 | exception is raised and the largest unsigned integer is returned.
3425 *----------------------------------------------------------------------------*/
3426 
3427 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
3428                                 uint64_t absZ1, float_status *status)
3429 {
3430     int8_t roundingMode;
3431     flag roundNearestEven, increment;
3432 
3433     roundingMode = status->float_rounding_mode;
3434     roundNearestEven = (roundingMode == float_round_nearest_even);
3435     switch (roundingMode) {
3436     case float_round_nearest_even:
3437     case float_round_ties_away:
3438         increment = ((int64_t)absZ1 < 0);
3439         break;
3440     case float_round_to_zero:
3441         increment = 0;
3442         break;
3443     case float_round_up:
3444         increment = !zSign && absZ1;
3445         break;
3446     case float_round_down:
3447         increment = zSign && absZ1;
3448         break;
3449     case float_round_to_odd:
3450         increment = !(absZ0 & 1) && absZ1;
3451         break;
3452     default:
3453         abort();
3454     }
3455     if (increment) {
3456         ++absZ0;
3457         if (absZ0 == 0) {
3458             float_raise(float_flag_invalid, status);
3459             return LIT64(0xFFFFFFFFFFFFFFFF);
3460         }
3461         absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
3462     }
3463 
3464     if (zSign && absZ0) {
3465         float_raise(float_flag_invalid, status);
3466         return 0;
3467     }
3468 
3469     if (absZ1) {
3470         status->float_exception_flags |= float_flag_inexact;
3471     }
3472     return absZ0;
3473 }
3474 
3475 /*----------------------------------------------------------------------------
3476 | If `a' is denormal and we are in flush-to-zero mode then set the
3477 | input-denormal exception and return zero. Otherwise just return the value.
3478 *----------------------------------------------------------------------------*/
3479 float32 float32_squash_input_denormal(float32 a, float_status *status)
3480 {
3481     if (status->flush_inputs_to_zero) {
3482         if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
3483             float_raise(float_flag_input_denormal, status);
3484             return make_float32(float32_val(a) & 0x80000000);
3485         }
3486     }
3487     return a;
3488 }
3489 
/*----------------------------------------------------------------------------
| Normalizes the subnormal single-precision floating-point value represented
| by the denormalized significand `aSig'.  The normalized exponent and
| significand are stored at the locations pointed to by `zExpPtr' and
| `zSigPtr', respectively.
*----------------------------------------------------------------------------*/

static void
 normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
{
    /* Left shift that leaves exactly 8 zero bits above the leading one. */
    const int8_t lshift = clz32(aSig) - 8;

    *zSigPtr = aSig << lshift;
    /* Each position shifted lowers the exponent by one, starting from 1. */
    *zExpPtr = 1 - lshift;
}
3507 
3508 /*----------------------------------------------------------------------------
3509 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3510 | and significand `zSig', and returns the proper single-precision floating-
3511 | point value corresponding to the abstract input.  Ordinarily, the abstract
3512 | value is simply rounded and packed into the single-precision format, with
3513 | the inexact exception raised if the abstract input cannot be represented
3514 | exactly.  However, if the abstract value is too large, the overflow and
3515 | inexact exceptions are raised and an infinity or maximal finite value is
3516 | returned.  If the abstract value is too small, the input value is rounded to
3517 | a subnormal number, and the underflow and inexact exceptions are raised if
3518 | the abstract input cannot be represented exactly as a subnormal single-
3519 | precision floating-point number.
3520 |     The input significand `zSig' has its binary point between bits 30
3521 | and 29, which is 7 bits to the left of the usual location.  This shifted
3522 | significand must be normalized or smaller.  If `zSig' is not normalized,
3523 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3524 | and it must not require rounding.  In the usual case that `zSig' is
3525 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3526 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3527 | Binary Floating-Point Arithmetic.
3528 *----------------------------------------------------------------------------*/
3529 
3530 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
3531                                    float_status *status)
3532 {
3533     int8_t roundingMode;
3534     flag roundNearestEven;
3535     int8_t roundIncrement, roundBits;
3536     flag isTiny;
3537 
3538     roundingMode = status->float_rounding_mode;
3539     roundNearestEven = ( roundingMode == float_round_nearest_even );
3540     switch (roundingMode) {
3541     case float_round_nearest_even:
3542     case float_round_ties_away:
3543         roundIncrement = 0x40;
3544         break;
3545     case float_round_to_zero:
3546         roundIncrement = 0;
3547         break;
3548     case float_round_up:
3549         roundIncrement = zSign ? 0 : 0x7f;
3550         break;
3551     case float_round_down:
3552         roundIncrement = zSign ? 0x7f : 0;
3553         break;
3554     case float_round_to_odd:
3555         roundIncrement = zSig & 0x80 ? 0 : 0x7f;
3556         break;
3557     default:
3558         abort();
3559         break;
3560     }
3561     roundBits = zSig & 0x7F;
3562     if ( 0xFD <= (uint16_t) zExp ) {
3563         if (    ( 0xFD < zExp )
3564              || (    ( zExp == 0xFD )
3565                   && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
3566            ) {
3567             bool overflow_to_inf = roundingMode != float_round_to_odd &&
3568                                    roundIncrement != 0;
3569             float_raise(float_flag_overflow | float_flag_inexact, status);
3570             return packFloat32(zSign, 0xFF, -!overflow_to_inf);
3571         }
3572         if ( zExp < 0 ) {
3573             if (status->flush_to_zero) {
3574                 float_raise(float_flag_output_denormal, status);
3575                 return packFloat32(zSign, 0, 0);
3576             }
3577             isTiny =
3578                 (status->float_detect_tininess
3579                  == float_tininess_before_rounding)
3580                 || ( zExp < -1 )
3581                 || ( zSig + roundIncrement < 0x80000000 );
3582             shift32RightJamming( zSig, - zExp, &zSig );
3583             zExp = 0;
3584             roundBits = zSig & 0x7F;
3585             if (isTiny && roundBits) {
3586                 float_raise(float_flag_underflow, status);
3587             }
3588             if (roundingMode == float_round_to_odd) {
3589                 /*
3590                  * For round-to-odd case, the roundIncrement depends on
3591                  * zSig which just changed.
3592                  */
3593                 roundIncrement = zSig & 0x80 ? 0 : 0x7f;
3594             }
3595         }
3596     }
3597     if (roundBits) {
3598         status->float_exception_flags |= float_flag_inexact;
3599     }
3600     zSig = ( zSig + roundIncrement )>>7;
3601     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
3602     if ( zSig == 0 ) zExp = 0;
3603     return packFloat32( zSign, zExp, zSig );
3604 
3605 }
3606 
3607 /*----------------------------------------------------------------------------
3608 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3609 | and significand `zSig', and returns the proper single-precision floating-
3610 | point value corresponding to the abstract input.  This routine is just like
3611 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
3612 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
3613 | floating-point exponent.
3614 *----------------------------------------------------------------------------*/
3615 
3616 static float32
3617  normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
3618                               float_status *status)
3619 {
3620     int8_t shiftCount;
3621 
3622     shiftCount = clz32(zSig) - 1;
3623     return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
3624                                status);
3625 
3626 }
3627 
3628 /*----------------------------------------------------------------------------
3629 | If `a' is denormal and flush_inputs_to_zero is set, raise the input-denormal
3630 | exception and return a zero with the sign of `a'.  Otherwise return `a'.
3631 *----------------------------------------------------------------------------*/
3632 float64 float64_squash_input_denormal(float64 a, float_status *status)
3633 {
3634     if (status->flush_inputs_to_zero) {
3635         if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
3636             float_raise(float_flag_input_denormal, status);
3637             return make_float64(float64_val(a) & (1ULL << 63));
3638         }
3639     }
3640     return a;
3641 }
3642 
3643 /*----------------------------------------------------------------------------
3644 | Normalizes the subnormal double-precision floating-point value represented
3645 | by the denormalized significand `aSig'.  The normalized exponent and
3646 | significand are stored at the locations pointed to by `zExpPtr' and
3647 | `zSigPtr', respectively.
3648 *----------------------------------------------------------------------------*/
3649 
static void normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr,
                                      uint64_t *zSigPtr)
{
    /*
     * Left shift that moves the most significant set bit of aSig to bit 52
     * (assuming clz64 returns the number of leading zero bits).
     */
    const int8_t lshift = clz64(aSig) - 11;

    *zSigPtr = aSig << lshift;
    /* Each position shifted lowers the exponent by one, starting from 1. */
    *zExpPtr = 1 - lshift;
}
3660 
3661 /*----------------------------------------------------------------------------
3662 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3663 | double-precision floating-point value, returning the result.  After being
3664 | shifted into the proper positions, the three fields are simply added
3665 | together to form the result.  This means that any integer portion of `zSig'
3666 | will be added into the exponent.  Since a properly normalized significand
3667 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3668 | than the desired result exponent whenever `zSig' is a complete, normalized
3669 | significand.
3670 *----------------------------------------------------------------------------*/
3671 
3672 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig)
3673 {
3674 
3675     return make_float64(
3676         ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
3677 
3678 }
3679 
3680 /*----------------------------------------------------------------------------
3681 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3682 | and significand `zSig', and returns the proper double-precision floating-
3683 | point value corresponding to the abstract input.  Ordinarily, the abstract
3684 | value is simply rounded and packed into the double-precision format, with
3685 | the inexact exception raised if the abstract input cannot be represented
3686 | exactly.  However, if the abstract value is too large, the overflow and
3687 | inexact exceptions are raised and an infinity or maximal finite value is
3688 | returned.  If the abstract value is too small, the input value is rounded to
3689 | a subnormal number, and the underflow and inexact exceptions are raised if
3690 | the abstract input cannot be represented exactly as a subnormal double-
3691 | precision floating-point number.
3692 |     The input significand `zSig' has its binary point between bits 62
3693 | and 61, which is 10 bits to the left of the usual location.  This shifted
3694 | significand must be normalized or smaller.  If `zSig' is not normalized,
3695 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3696 | and it must not require rounding.  In the usual case that `zSig' is
3697 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3698 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3699 | Binary Floating-Point Arithmetic.
3700 *----------------------------------------------------------------------------*/
3701 
3702 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
3703                                    float_status *status)
3704 {
3705     int8_t roundingMode;
3706     flag roundNearestEven;
3707     int roundIncrement, roundBits;
3708     flag isTiny;
3709 
3710     roundingMode = status->float_rounding_mode;
3711     roundNearestEven = ( roundingMode == float_round_nearest_even );
3712     switch (roundingMode) {
3713     case float_round_nearest_even:
3714     case float_round_ties_away:
3715         roundIncrement = 0x200;
3716         break;
3717     case float_round_to_zero:
3718         roundIncrement = 0;
3719         break;
3720     case float_round_up:
3721         roundIncrement = zSign ? 0 : 0x3ff;
3722         break;
3723     case float_round_down:
3724         roundIncrement = zSign ? 0x3ff : 0;
3725         break;
3726     case float_round_to_odd:
3727         roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
3728         break;
3729     default:
3730         abort();
3731     }
3732     roundBits = zSig & 0x3FF;
3733     if ( 0x7FD <= (uint16_t) zExp ) {
3734         if (    ( 0x7FD < zExp )
3735              || (    ( zExp == 0x7FD )
3736                   && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
3737            ) {
3738             bool overflow_to_inf = roundingMode != float_round_to_odd &&
3739                                    roundIncrement != 0;
3740             float_raise(float_flag_overflow | float_flag_inexact, status);
3741             return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
3742         }
3743         if ( zExp < 0 ) {
3744             if (status->flush_to_zero) {
3745                 float_raise(float_flag_output_denormal, status);
3746                 return packFloat64(zSign, 0, 0);
3747             }
3748             isTiny =
3749                    (status->float_detect_tininess
3750                     == float_tininess_before_rounding)
3751                 || ( zExp < -1 )
3752                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
3753             shift64RightJamming( zSig, - zExp, &zSig );
3754             zExp = 0;
3755             roundBits = zSig & 0x3FF;
3756             if (isTiny && roundBits) {
3757                 float_raise(float_flag_underflow, status);
3758             }
3759             if (roundingMode == float_round_to_odd) {
3760                 /*
3761                  * For round-to-odd case, the roundIncrement depends on
3762                  * zSig which just changed.
3763                  */
3764                 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
3765             }
3766         }
3767     }
3768     if (roundBits) {
3769         status->float_exception_flags |= float_flag_inexact;
3770     }
3771     zSig = ( zSig + roundIncrement )>>10;
3772     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
3773     if ( zSig == 0 ) zExp = 0;
3774     return packFloat64( zSign, zExp, zSig );
3775 
3776 }
3777 
3778 /*----------------------------------------------------------------------------
3779 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3780 | and significand `zSig', and returns the proper double-precision floating-
3781 | point value corresponding to the abstract input.  This routine is just like
3782 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
3783 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
3784 | floating-point exponent.
3785 *----------------------------------------------------------------------------*/
3786 
3787 static float64
3788  normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
3789                               float_status *status)
3790 {
3791     int8_t shiftCount;
3792 
3793     shiftCount = clz64(zSig) - 1;
3794     return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
3795                                status);
3796 
3797 }
3798 
3799 /*----------------------------------------------------------------------------
3800 | Normalizes the subnormal extended double-precision floating-point value
3801 | represented by the denormalized significand `aSig'.  The normalized exponent
3802 | and significand are stored at the locations pointed to by `zExpPtr' and
3803 | `zSigPtr', respectively.
3804 *----------------------------------------------------------------------------*/
3805 
void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr,
                                uint64_t *zSigPtr)
{
    /*
     * Left shift that moves the most significant set bit of aSig to bit 63
     * (assuming clz64 returns the number of leading zero bits).
     */
    const int8_t lshift = clz64(aSig);

    *zSigPtr = aSig << lshift;
    /* Each position shifted lowers the exponent by one, starting from 1. */
    *zExpPtr = 1 - lshift;
}
3815 
3816 /*----------------------------------------------------------------------------
3817 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3818 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
3819 | and returns the proper extended double-precision floating-point value
3820 | corresponding to the abstract input.  Ordinarily, the abstract value is
3821 | rounded and packed into the extended double-precision format, with the
3822 | inexact exception raised if the abstract input cannot be represented
3823 | exactly.  However, if the abstract value is too large, the overflow and
3824 | inexact exceptions are raised and an infinity or maximal finite value is
3825 | returned.  If the abstract value is too small, the input value is rounded to
3826 | a subnormal number, and the underflow and inexact exceptions are raised if
3827 | the abstract input cannot be represented exactly as a subnormal extended
3828 | double-precision floating-point number.
3829 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
3830 | number of bits as single or double precision, respectively.  Otherwise, the
3831 | result is rounded to the full precision of the extended double-precision
3832 | format.
3833 |     The input significand must be normalized or smaller.  If the input
3834 | significand is not normalized, `zExp' must be 0; in that case, the result
3835 | returned is a subnormal number, and it must not require rounding.  The
3836 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
3837 | Floating-Point Arithmetic.
3838 *----------------------------------------------------------------------------*/
3839 
3840 floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
3841                               int32_t zExp, uint64_t zSig0, uint64_t zSig1,
3842                               float_status *status)
3843 {
3844     int8_t roundingMode;
3845     flag roundNearestEven, increment, isTiny;
3846     int64_t roundIncrement, roundMask, roundBits;
3847 
3848     roundingMode = status->float_rounding_mode;
3849     roundNearestEven = ( roundingMode == float_round_nearest_even );
    /*
     * Two separate rounding paths follow: a reduced-precision path that
     * rounds inside zSig0 (roundingPrecision 64 or 32), and the full
     * precision path at the `precision80' label that rounds on zSig1.
     * Any roundingPrecision other than 64 or 32 takes the full path.
     */
3850     if ( roundingPrecision == 80 ) goto precision80;
3851     if ( roundingPrecision == 64 ) {
        /* Discard the low 11 bits of zSig0; bit 10 is the half-way point. */
3852         roundIncrement = LIT64( 0x0000000000000400 );
3853         roundMask = LIT64( 0x00000000000007FF );
3854     }
3855     else if ( roundingPrecision == 32 ) {
        /* Discard the low 40 bits of zSig0; bit 39 is the half-way point. */
3856         roundIncrement = LIT64( 0x0000008000000000 );
3857         roundMask = LIT64( 0x000000FFFFFFFFFF );
3858     }
3859     else {
3860         goto precision80;
3861     }
    /* Collapse zSig1 into a sticky bit in the LSB of zSig0. */
3862     zSig0 |= ( zSig1 != 0 );
    /*
     * Nearest modes keep the half-way increment chosen above; directed
     * modes use either 0 or the whole mask.  Modes without a case here
     * (including float_round_to_odd) reach abort().
     */
3863     switch (roundingMode) {
3864     case float_round_nearest_even:
3865     case float_round_ties_away:
3866         break;
3867     case float_round_to_zero:
3868         roundIncrement = 0;
3869         break;
3870     case float_round_up:
3871         roundIncrement = zSign ? 0 : roundMask;
3872         break;
3873     case float_round_down:
3874         roundIncrement = zSign ? roundMask : 0;
3875         break;
3876     default:
3877         abort();
3878     }
3879     roundBits = zSig0 & roundMask;
    /*
     * The unsigned compare of zExp - 1 is true both for zExp <= 0 (the
     * subtraction wraps to a large value) and for zExp >= 0x7FFE.
     */
3880     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        /*
         * Overflow if the exponent is already too large, or if it is the
         * largest one and adding roundIncrement carries out of zSig0.
         */
3881         if (    ( 0x7FFE < zExp )
3882              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
3883            ) {
            /*
             * Jumps into the full-precision section below with roundMask
             * still set, so the largest finite result uses ~roundMask.
             */
3884             goto overflow;
3885         }
3886         if ( zExp <= 0 ) {
3887             if (status->flush_to_zero) {
3888                 float_raise(float_flag_output_denormal, status);
3889                 return packFloatx80(zSign, 0, 0);
3890             }
            /*
             * Tiny unless tininess is detected after rounding, zExp is 0
             * and the rounding addition carries out of zSig0.
             */
3891             isTiny =
3892                    (status->float_detect_tininess
3893                     == float_tininess_before_rounding)
3894                 || ( zExp < 0 )
3895                 || ( zSig0 <= zSig0 + roundIncrement );
            /* Denormalize; presumably a sticky right shift - confirm. */
3896             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
3897             zExp = 0;
3898             roundBits = zSig0 & roundMask;
3899             if (isTiny && roundBits) {
3900                 float_raise(float_flag_underflow, status);
3901             }
3902             if (roundBits) {
3903                 status->float_exception_flags |= float_flag_inexact;
3904             }
3905             zSig0 += roundIncrement;
            /* Rounding set the top bit again: exponent becomes 1. */
3906             if ( (int64_t) zSig0 < 0 ) zExp = 1;
            /*
             * roundIncrement is reused as the LSB of the rounded result;
             * on an exact tie in nearest-even mode that bit is added to
             * the mask so it is cleared together with the discarded bits.
             */
3907             roundIncrement = roundMask + 1;
3908             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
3909                 roundMask |= roundIncrement;
3910             }
3911             zSig0 &= ~ roundMask;
3912             return packFloatx80( zSign, zExp, zSig0 );
3913         }
3914     }
3915     if (roundBits) {
3916         status->float_exception_flags |= float_flag_inexact;
3917     }
3918     zSig0 += roundIncrement;
    /* A wrapped sum means a carry out of bit 63: renormalize. */
3919     if ( zSig0 < roundIncrement ) {
3920         ++zExp;
3921         zSig0 = LIT64( 0x8000000000000000 );
3922     }
    /* Same tie-to-even mask widening as in the subnormal case above. */
3923     roundIncrement = roundMask + 1;
3924     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
3925         roundMask |= roundIncrement;
3926     }
3927     zSig0 &= ~ roundMask;
3928     if ( zSig0 == 0 ) zExp = 0;
3929     return packFloatx80( zSign, zExp, zSig0 );
3930  precision80:
    /*
     * Full precision: zSig1 holds the bits below the result, and
     * `increment' says whether zSig0 must be bumped by one.  Modes
     * without a case here (including float_round_to_odd) reach abort().
     */
3931     switch (roundingMode) {
3932     case float_round_nearest_even:
3933     case float_round_ties_away:
        /* Round up when the top bit of zSig1 is set. */
3934         increment = ((int64_t)zSig1 < 0);
3935         break;
3936     case float_round_to_zero:
3937         increment = 0;
3938         break;
3939     case float_round_up:
3940         increment = !zSign && zSig1;
3941         break;
3942     case float_round_down:
3943         increment = zSign && zSig1;
3944         break;
3945     default:
3946         abort();
3947     }
    /* Same range test as above: zExp <= 0 or zExp >= 0x7FFE. */
3948     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
3949         if (    ( 0x7FFE < zExp )
3950              || (    ( zExp == 0x7FFE )
3951                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
3952                   && increment
3953                 )
3954            ) {
            /* Full precision: the largest finite significand is all ones. */
3955             roundMask = 0;
3956  overflow:
            /* Also entered by goto from the reduced-precision path. */
3957             float_raise(float_flag_overflow | float_flag_inexact, status);
            /*
             * Modes that round toward zero for this sign return the
             * largest finite value; all others return infinity.
             */
3958             if (    ( roundingMode == float_round_to_zero )
3959                  || ( zSign && ( roundingMode == float_round_up ) )
3960                  || ( ! zSign && ( roundingMode == float_round_down ) )
3961                ) {
3962                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
3963             }
3964             return packFloatx80(zSign,
3965                                 floatx80_infinity_high,
3966                                 floatx80_infinity_low);
3967         }
3968         if ( zExp <= 0 ) {
            /*
             * NOTE(review): unlike the reduced-precision path above,
             * status->flush_to_zero is not consulted here - confirm this
             * is intended.
             */
3969             isTiny =
3970                    (status->float_detect_tininess
3971                     == float_tininess_before_rounding)
3972                 || ( zExp < 0 )
3973                 || ! increment
3974                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
            /* Denormalize; presumably a sticky 128-bit shift - confirm. */
3975             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
3976             zExp = 0;
3977             if (isTiny && zSig1) {
3978                 float_raise(float_flag_underflow, status);
3979             }
3980             if (zSig1) {
3981                 status->float_exception_flags |= float_flag_inexact;
3982             }
            /* zSig1 changed in the shift, so recompute the decision. */
3983             switch (roundingMode) {
3984             case float_round_nearest_even:
3985             case float_round_ties_away:
3986                 increment = ((int64_t)zSig1 < 0);
3987                 break;
3988             case float_round_to_zero:
3989                 increment = 0;
3990                 break;
3991             case float_round_up:
3992                 increment = !zSign && zSig1;
3993                 break;
3994             case float_round_down:
3995                 increment = zSign && zSig1;
3996                 break;
3997             default:
3998                 abort();
3999             }
4000             if ( increment ) {
4001                 ++zSig0;
                /*
                 * Nearest-even with nothing set below the top bit of
                 * zSig1 is an exact tie: clear the LSB of zSig0.
                 */
4002                 zSig0 &=
4003                     ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
                /* Rounding set the top bit again: exponent becomes 1. */
4004                 if ( (int64_t) zSig0 < 0 ) zExp = 1;
4005             }
4006             return packFloatx80( zSign, zExp, zSig0 );
4007         }
4008     }
4009     if (zSig1) {
4010         status->float_exception_flags |= float_flag_inexact;
4011     }
4012     if ( increment ) {
4013         ++zSig0;
        /* Wrapped to zero means a carry out of bit 63: renormalize. */
4014         if ( zSig0 == 0 ) {
4015             ++zExp;
4016             zSig0 = LIT64( 0x8000000000000000 );
4017         }
4018         else {
            /* Exact tie in nearest-even mode: clear the LSB of zSig0. */
4019             zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
4020         }
4021     }
4022     else {
4023         if ( zSig0 == 0 ) zExp = 0;
4024     }
4025     return packFloatx80( zSign, zExp, zSig0 );
4026 
4027 }
4028 
4029 /*----------------------------------------------------------------------------
4030 | Takes an abstract floating-point value having sign `zSign', exponent
4031 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
4032 | and returns the proper extended double-precision floating-point value
4033 | corresponding to the abstract input.  This routine is just like
4034 | `roundAndPackFloatx80' except that the input significand does not have to be
4035 | normalized.
4036 *----------------------------------------------------------------------------*/
4037 
4038 floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
4039                                        flag zSign, int32_t zExp,
4040                                        uint64_t zSig0, uint64_t zSig1,
4041                                        float_status *status)
4042 {
4043     int8_t shiftCount;
4044 
4045     if ( zSig0 == 0 ) {
4046         zSig0 = zSig1;
4047         zSig1 = 0;
4048         zExp -= 64;
4049     }
4050     shiftCount = clz64(zSig0);
4051     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4052     zExp -= shiftCount;
4053     return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
4054                                 zSig0, zSig1, status);
4055 
4056 }
4057 
4058 /*----------------------------------------------------------------------------
4059 | Returns the least-significant 64 fraction bits of the quadruple-precision
4060 | floating-point value `a'.
4061 *----------------------------------------------------------------------------*/
4062 
4063 static inline uint64_t extractFloat128Frac1( float128 a )
4064 {
4065 
4066     return a.low;
4067 
4068 }
4069 
4070 /*----------------------------------------------------------------------------
4071 | Returns the most-significant 48 fraction bits of the quadruple-precision
4072 | floating-point value `a'.
4073 *----------------------------------------------------------------------------*/
4074 
4075 static inline uint64_t extractFloat128Frac0( float128 a )
4076 {
4077 
4078     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
4079 
4080 }
4081 
4082 /*----------------------------------------------------------------------------
4083 | Returns the exponent bits of the quadruple-precision floating-point value
4084 | `a'.
4085 *----------------------------------------------------------------------------*/
4086 
4087 static inline int32_t extractFloat128Exp( float128 a )
4088 {
4089 
4090     return ( a.high>>48 ) & 0x7FFF;
4091 
4092 }
4093 
4094 /*----------------------------------------------------------------------------
4095 | Returns the sign bit of the quadruple-precision floating-point value `a'.
4096 *----------------------------------------------------------------------------*/
4097 
4098 static inline flag extractFloat128Sign( float128 a )
4099 {
4100 
4101     return a.high>>63;
4102 
4103 }
4104 
4105 /*----------------------------------------------------------------------------
4106 | Normalizes the subnormal quadruple-precision floating-point value
4107 | represented by the denormalized significand formed by the concatenation of
4108 | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
4109 | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
4110 | significand are stored at the location pointed to by `zSig0Ptr', and the
4111 | least significant 64 bits of the normalized significand are stored at the
4112 | location pointed to by `zSig1Ptr'.
4113 *----------------------------------------------------------------------------*/
4114 
static void normalizeFloat128Subnormal(uint64_t aSig0, uint64_t aSig1,
                                       int32_t *zExpPtr,
                                       uint64_t *zSig0Ptr,
                                       uint64_t *zSig1Ptr)
{
    int8_t lshift;

    /* High word non-zero: a plain 128-bit left shift is enough. */
    if (aSig0 != 0) {
        lshift = clz64(aSig0) - 15;
        shortShift128Left(aSig0, aSig1, lshift, zSig0Ptr, zSig1Ptr);
        *zExpPtr = 1 - lshift;
        return;
    }

    /*
     * High word empty: the leading bit comes from aSig1 and has to land
     * at bit 48 of the high result word.
     */
    lshift = clz64(aSig1) - 15;
    if (lshift >= 0) {
        /* Everything fits into the high word. */
        *zSig0Ptr = aSig1 << lshift;
        *zSig1Ptr = 0;
    } else {
        /* Fewer than 15 leading zeros: split aSig1 across both words. */
        *zSig0Ptr = aSig1 >> (-lshift);
        *zSig1Ptr = aSig1 << (lshift & 63);
    }
    *zExpPtr = -lshift - 63;
}
4145 
4146 /*----------------------------------------------------------------------------
4147 | Packs the sign `zSign', the exponent `zExp', and the significand formed
4148 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
4149 | floating-point value, returning the result.  After being shifted into the
4150 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
4151 | added together to form the most significant 64 bits of the result.  This
4152 | means that any integer portion of `zSig0' will be added into the exponent.
4153 | Since a properly normalized significand will have an integer portion equal
4154 | to 1, the `zExp' input should be 1 less than the desired result exponent
4155 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
4156 | significand.
4157 *----------------------------------------------------------------------------*/
4158 
4159 static inline float128
4160  packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
4161 {
4162     float128 z;
4163 
4164     z.low = zSig1;
4165     z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
4166     return z;
4167 
4168 }
4169 
4170 /*----------------------------------------------------------------------------
4171 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4172 | and extended significand formed by the concatenation of `zSig0', `zSig1',
4173 | and `zSig2', and returns the proper quadruple-precision floating-point value
4174 | corresponding to the abstract input.  Ordinarily, the abstract value is
4175 | simply rounded and packed into the quadruple-precision format, with the
4176 | inexact exception raised if the abstract input cannot be represented
4177 | exactly.  However, if the abstract value is too large, the overflow and
4178 | inexact exceptions are raised and an infinity or maximal finite value is
4179 | returned.  If the abstract value is too small, the input value is rounded to
4180 | a subnormal number, and the underflow and inexact exceptions are raised if
4181 | the abstract input cannot be represented exactly as a subnormal quadruple-
4182 | precision floating-point number.
4183 |     The input significand must be normalized or smaller.  If the input
4184 | significand is not normalized, `zExp' must be 0; in that case, the result
4185 | returned is a subnormal number, and it must not require rounding.  In the
4186 | usual case that the input significand is normalized, `zExp' must be 1 less
4187 | than the ``true'' floating-point exponent.  The handling of underflow and
4188 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4189 *----------------------------------------------------------------------------*/
4190 
4191 static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
4192                                      uint64_t zSig0, uint64_t zSig1,
4193                                      uint64_t zSig2, float_status *status)
4194 {
4195     int8_t roundingMode;
4196     flag roundNearestEven, increment, isTiny;
4197 
4198     roundingMode = status->float_rounding_mode;
4199     roundNearestEven = ( roundingMode == float_round_nearest_even );
4200     switch (roundingMode) {
4201     case float_round_nearest_even:
4202     case float_round_ties_away:
4203         increment = ((int64_t)zSig2 < 0);
4204         break;
4205     case float_round_to_zero:
4206         increment = 0;
4207         break;
4208     case float_round_up:
4209         increment = !zSign && zSig2;
4210         break;
4211     case float_round_down:
4212         increment = zSign && zSig2;
4213         break;
4214     case float_round_to_odd:
4215         increment = !(zSig1 & 0x1) && zSig2;
4216         break;
4217     default:
4218         abort();
4219     }
4220     if ( 0x7FFD <= (uint32_t) zExp ) {
4221         if (    ( 0x7FFD < zExp )
4222              || (    ( zExp == 0x7FFD )
4223                   && eq128(
4224                          LIT64( 0x0001FFFFFFFFFFFF ),
4225                          LIT64( 0xFFFFFFFFFFFFFFFF ),
4226                          zSig0,
4227                          zSig1
4228                      )
4229                   && increment
4230                 )
4231            ) {
4232             float_raise(float_flag_overflow | float_flag_inexact, status);
4233             if (    ( roundingMode == float_round_to_zero )
4234                  || ( zSign && ( roundingMode == float_round_up ) )
4235                  || ( ! zSign && ( roundingMode == float_round_down ) )
4236                  || (roundingMode == float_round_to_odd)
4237                ) {
4238                 return
4239                     packFloat128(
4240                         zSign,
4241                         0x7FFE,
4242                         LIT64( 0x0000FFFFFFFFFFFF ),
4243                         LIT64( 0xFFFFFFFFFFFFFFFF )
4244                     );
4245             }
4246             return packFloat128( zSign, 0x7FFF, 0, 0 );
4247         }
4248         if ( zExp < 0 ) {
4249             if (status->flush_to_zero) {
4250                 float_raise(float_flag_output_denormal, status);
4251                 return packFloat128(zSign, 0, 0, 0);
4252             }
4253             isTiny =
4254                    (status->float_detect_tininess
4255                     == float_tininess_before_rounding)
4256                 || ( zExp < -1 )
4257                 || ! increment
4258                 || lt128(
4259                        zSig0,
4260                        zSig1,
4261                        LIT64( 0x0001FFFFFFFFFFFF ),
4262                        LIT64( 0xFFFFFFFFFFFFFFFF )
4263                    );
4264             shift128ExtraRightJamming(
4265                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
4266             zExp = 0;
4267             if (isTiny && zSig2) {
4268                 float_raise(float_flag_underflow, status);
4269             }
4270             switch (roundingMode) {
4271             case float_round_nearest_even:
4272             case float_round_ties_away:
4273                 increment = ((int64_t)zSig2 < 0);
4274                 break;
4275             case float_round_to_zero:
4276                 increment = 0;
4277                 break;
4278             case float_round_up:
4279                 increment = !zSign && zSig2;
4280                 break;
4281             case float_round_down:
4282                 increment = zSign && zSig2;
4283                 break;
4284             case float_round_to_odd:
4285                 increment = !(zSig1 & 0x1) && zSig2;
4286                 break;
4287             default:
4288                 abort();
4289             }
4290         }
4291     }
4292     if (zSig2) {
4293         status->float_exception_flags |= float_flag_inexact;
4294     }
4295     if ( increment ) {
4296         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
4297         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
4298     }
4299     else {
4300         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
4301     }
4302     return packFloat128( zSign, zExp, zSig0, zSig1 );
4303 
4304 }
4305 
4306 /*----------------------------------------------------------------------------
4307 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
4308 | and significand formed by the concatenation of `zSig0' and `zSig1', and
4309 | returns the proper quadruple-precision floating-point value corresponding
4310 | to the abstract input.  This routine is just like `roundAndPackFloat128'
4311 | except that the input significand has fewer bits and does not have to be
4312 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
4313 | point exponent.
4314 *----------------------------------------------------------------------------*/
4315 
4316 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
4317                                               uint64_t zSig0, uint64_t zSig1,
4318                                               float_status *status)
4319 {
4320     int8_t shiftCount;
4321     uint64_t zSig2;
4322 
4323     if ( zSig0 == 0 ) {
4324         zSig0 = zSig1;
4325         zSig1 = 0;
4326         zExp -= 64;
4327     }
4328     shiftCount = clz64(zSig0) - 15;
4329     if ( 0 <= shiftCount ) {
4330         zSig2 = 0;
4331         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4332     }
4333     else {
4334         shift128ExtraRightJamming(
4335             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
4336     }
4337     zExp -= shiftCount;
4338     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
4339 
4340 }
4341 
4342 
4343 /*----------------------------------------------------------------------------
4344 | Returns the result of converting the 32-bit two's complement integer `a'
4345 | to the extended double-precision floating-point format.  The conversion
4346 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4347 | Arithmetic.
4348 *----------------------------------------------------------------------------*/
4349 
4350 floatx80 int32_to_floatx80(int32_t a, float_status *status)
4351 {
4352     flag zSign;
4353     uint32_t absA;
4354     int8_t shiftCount;
4355     uint64_t zSig;
4356 
4357     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
4358     zSign = ( a < 0 );
4359     absA = zSign ? - a : a;
4360     shiftCount = clz32(absA) + 32;
4361     zSig = absA;
4362     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
4363 
4364 }
4365 
4366 /*----------------------------------------------------------------------------
4367 | Returns the result of converting the 32-bit two's complement integer `a' to
4368 | the quadruple-precision floating-point format.  The conversion is performed
4369 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4370 *----------------------------------------------------------------------------*/
4371 
4372 float128 int32_to_float128(int32_t a, float_status *status)
4373 {
4374     flag zSign;
4375     uint32_t absA;
4376     int8_t shiftCount;
4377     uint64_t zSig0;
4378 
4379     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
4380     zSign = ( a < 0 );
4381     absA = zSign ? - a : a;
4382     shiftCount = clz32(absA) + 17;
4383     zSig0 = absA;
4384     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
4385 
4386 }
4387 
4388 /*----------------------------------------------------------------------------
4389 | Returns the result of converting the 64-bit two's complement integer `a'
4390 | to the extended double-precision floating-point format.  The conversion
4391 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4392 | Arithmetic.
4393 *----------------------------------------------------------------------------*/
4394 
4395 floatx80 int64_to_floatx80(int64_t a, float_status *status)
4396 {
4397     flag zSign;
4398     uint64_t absA;
4399     int8_t shiftCount;
4400 
4401     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
4402     zSign = ( a < 0 );
4403     absA = zSign ? - a : a;
4404     shiftCount = clz64(absA);
4405     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
4406 
4407 }
4408 
4409 /*----------------------------------------------------------------------------
4410 | Returns the result of converting the 64-bit two's complement integer `a' to
4411 | the quadruple-precision floating-point format.  The conversion is performed
4412 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4413 *----------------------------------------------------------------------------*/
4414 
4415 float128 int64_to_float128(int64_t a, float_status *status)
4416 {
4417     flag zSign;
4418     uint64_t absA;
4419     int8_t shiftCount;
4420     int32_t zExp;
4421     uint64_t zSig0, zSig1;
4422 
4423     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
4424     zSign = ( a < 0 );
4425     absA = zSign ? - a : a;
4426     shiftCount = clz64(absA) + 49;
4427     zExp = 0x406E - shiftCount;
4428     if ( 64 <= shiftCount ) {
4429         zSig1 = 0;
4430         zSig0 = absA;
4431         shiftCount -= 64;
4432     }
4433     else {
4434         zSig1 = absA;
4435         zSig0 = 0;
4436     }
4437     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
4438     return packFloat128( zSign, zExp, zSig0, zSig1 );
4439 
4440 }
4441 
4442 /*----------------------------------------------------------------------------
4443 | Returns the result of converting the 64-bit unsigned integer `a'
4444 | to the quadruple-precision floating-point format.  The conversion is performed
4445 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4446 *----------------------------------------------------------------------------*/
4447 
4448 float128 uint64_to_float128(uint64_t a, float_status *status)
4449 {
4450     if (a == 0) {
4451         return float128_zero;
4452     }
4453     return normalizeRoundAndPackFloat128(0, 0x406E, 0, a, status);
4454 }
4455 
4456 /*----------------------------------------------------------------------------
4457 | Returns the result of converting the single-precision floating-point value
4458 | `a' to the extended double-precision floating-point format.  The conversion
4459 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4460 | Arithmetic.
4461 *----------------------------------------------------------------------------*/
4462 
4463 floatx80 float32_to_floatx80(float32 a, float_status *status)
4464 {
4465     flag aSign;
4466     int aExp;
4467     uint32_t aSig;
4468 
4469     a = float32_squash_input_denormal(a, status);
4470     aSig = extractFloat32Frac( a );
4471     aExp = extractFloat32Exp( a );
4472     aSign = extractFloat32Sign( a );
4473     if ( aExp == 0xFF ) {
4474         if (aSig) {
4475             return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
4476         }
4477         return packFloatx80(aSign,
4478                             floatx80_infinity_high,
4479                             floatx80_infinity_low);
4480     }
4481     if ( aExp == 0 ) {
4482         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
4483         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4484     }
4485     aSig |= 0x00800000;
4486     return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
4487 
4488 }
4489 
4490 /*----------------------------------------------------------------------------
4491 | Returns the result of converting the single-precision floating-point value
4492 | `a' to the double-precision floating-point format.  The conversion is
4493 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4494 | Arithmetic.
4495 *----------------------------------------------------------------------------*/
4496 
4497 float128 float32_to_float128(float32 a, float_status *status)
4498 {
4499     flag aSign;
4500     int aExp;
4501     uint32_t aSig;
4502 
4503     a = float32_squash_input_denormal(a, status);
4504     aSig = extractFloat32Frac( a );
4505     aExp = extractFloat32Exp( a );
4506     aSign = extractFloat32Sign( a );
4507     if ( aExp == 0xFF ) {
4508         if (aSig) {
4509             return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
4510         }
4511         return packFloat128( aSign, 0x7FFF, 0, 0 );
4512     }
4513     if ( aExp == 0 ) {
4514         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
4515         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4516         --aExp;
4517     }
4518     return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
4519 
4520 }
4521 
4522 /*----------------------------------------------------------------------------
4523 | Returns the remainder of the single-precision floating-point value `a'
4524 | with respect to the corresponding value `b'.  The operation is performed
4525 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4526 *----------------------------------------------------------------------------*/
4527 
4528 float32 float32_rem(float32 a, float32 b, float_status *status)
4529 {
4530     flag aSign, zSign;
4531     int aExp, bExp, expDiff;
4532     uint32_t aSig, bSig;
4533     uint32_t q;
4534     uint64_t aSig64, bSig64, q64;
4535     uint32_t alternateASig;
4536     int32_t sigMean;
4537     a = float32_squash_input_denormal(a, status);
4538     b = float32_squash_input_denormal(b, status);
4539 
4540     aSig = extractFloat32Frac( a );
4541     aExp = extractFloat32Exp( a );
4542     aSign = extractFloat32Sign( a );
4543     bSig = extractFloat32Frac( b );
4544     bExp = extractFloat32Exp( b );
4545     if ( aExp == 0xFF ) {
4546         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
4547             return propagateFloat32NaN(a, b, status);
4548         }
4549         float_raise(float_flag_invalid, status);
4550         return float32_default_nan(status);
4551     }
4552     if ( bExp == 0xFF ) {
4553         if (bSig) {
4554             return propagateFloat32NaN(a, b, status);
4555         }
4556         return a;
4557     }
4558     if ( bExp == 0 ) {
4559         if ( bSig == 0 ) {
4560             float_raise(float_flag_invalid, status);
4561             return float32_default_nan(status);
4562         }
4563         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
4564     }
4565     if ( aExp == 0 ) {
4566         if ( aSig == 0 ) return a;
4567         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4568     }
4569     expDiff = aExp - bExp;
4570     aSig |= 0x00800000;
4571     bSig |= 0x00800000;
4572     if ( expDiff < 32 ) {
4573         aSig <<= 8;
4574         bSig <<= 8;
4575         if ( expDiff < 0 ) {
4576             if ( expDiff < -1 ) return a;
4577             aSig >>= 1;
4578         }
4579         q = ( bSig <= aSig );
4580         if ( q ) aSig -= bSig;
4581         if ( 0 < expDiff ) {
4582             q = ( ( (uint64_t) aSig )<<32 ) / bSig;
4583             q >>= 32 - expDiff;
4584             bSig >>= 2;
4585             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4586         }
4587         else {
4588             aSig >>= 2;
4589             bSig >>= 2;
4590         }
4591     }
4592     else {
4593         if ( bSig <= aSig ) aSig -= bSig;
4594         aSig64 = ( (uint64_t) aSig )<<40;
4595         bSig64 = ( (uint64_t) bSig )<<40;
4596         expDiff -= 64;
4597         while ( 0 < expDiff ) {
4598             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
4599             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
4600             aSig64 = - ( ( bSig * q64 )<<38 );
4601             expDiff -= 62;
4602         }
4603         expDiff += 64;
4604         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
4605         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
4606         q = q64>>( 64 - expDiff );
4607         bSig <<= 6;
4608         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
4609     }
4610     do {
4611         alternateASig = aSig;
4612         ++q;
4613         aSig -= bSig;
4614     } while ( 0 <= (int32_t) aSig );
4615     sigMean = aSig + alternateASig;
4616     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4617         aSig = alternateASig;
4618     }
4619     zSign = ( (int32_t) aSig < 0 );
4620     if ( zSign ) aSig = - aSig;
4621     return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
4622 }
4623 
4624 
4625 
4626 /*----------------------------------------------------------------------------
4627 | Returns the binary exponential of the single-precision floating-point value
4628 | `a'. The operation is performed according to the IEC/IEEE Standard for
4629 | Binary Floating-Point Arithmetic.
4630 |
4631 | Uses the following identities:
4632 |
4633 | 1. -------------------------------------------------------------------------
4634 |      x    x*ln(2)
4635 |     2  = e
4636 |
4637 | 2. -------------------------------------------------------------------------
4638 |                      2     3     4     5           n
4639 |      x        x     x     x     x     x           x
4640 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
4641 |               1!    2!    3!    4!    5!          n!
4642 *----------------------------------------------------------------------------*/
4643 
4644 static const float64 float32_exp2_coefficients[15] =
4645 {
4646     const_float64( 0x3ff0000000000000ll ), /*  1 */
4647     const_float64( 0x3fe0000000000000ll ), /*  2 */
4648     const_float64( 0x3fc5555555555555ll ), /*  3 */
4649     const_float64( 0x3fa5555555555555ll ), /*  4 */
4650     const_float64( 0x3f81111111111111ll ), /*  5 */
4651     const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
4652     const_float64( 0x3f2a01a01a01a01all ), /*  7 */
4653     const_float64( 0x3efa01a01a01a01all ), /*  8 */
4654     const_float64( 0x3ec71de3a556c734ll ), /*  9 */
4655     const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
4656     const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
4657     const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
4658     const_float64( 0x3de6124613a86d09ll ), /* 13 */
4659     const_float64( 0x3da93974a8c07c9dll ), /* 14 */
4660     const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
4661 };
4662 
4663 float32 float32_exp2(float32 a, float_status *status)
4664 {
4665     flag aSign;
4666     int aExp;
4667     uint32_t aSig;
4668     float64 r, x, xn;
4669     int i;
4670     a = float32_squash_input_denormal(a, status);
4671 
4672     aSig = extractFloat32Frac( a );
4673     aExp = extractFloat32Exp( a );
4674     aSign = extractFloat32Sign( a );
4675 
4676     if ( aExp == 0xFF) {
4677         if (aSig) {
4678             return propagateFloat32NaN(a, float32_zero, status);
4679         }
4680         return (aSign) ? float32_zero : a;
4681     }
4682     if (aExp == 0) {
4683         if (aSig == 0) return float32_one;
4684     }
4685 
4686     float_raise(float_flag_inexact, status);
4687 
4688     /* ******************************* */
4689     /* using float64 for approximation */
4690     /* ******************************* */
4691     x = float32_to_float64(a, status);
4692     x = float64_mul(x, float64_ln2, status);
4693 
4694     xn = x;
4695     r = float64_one;
4696     for (i = 0 ; i < 15 ; i++) {
4697         float64 f;
4698 
4699         f = float64_mul(xn, float32_exp2_coefficients[i], status);
4700         r = float64_add(r, f, status);
4701 
4702         xn = float64_mul(xn, x, status);
4703     }
4704 
4705     return float64_to_float32(r, status);
4706 }
4707 
4708 /*----------------------------------------------------------------------------
4709 | Returns the binary log of the single-precision floating-point value `a'.
4710 | The operation is performed according to the IEC/IEEE Standard for Binary
4711 | Floating-Point Arithmetic.
4712 *----------------------------------------------------------------------------*/
4713 float32 float32_log2(float32 a, float_status *status)
4714 {
4715     flag aSign, zSign;
4716     int aExp;
4717     uint32_t aSig, zSig, i;
4718 
4719     a = float32_squash_input_denormal(a, status);
4720     aSig = extractFloat32Frac( a );
4721     aExp = extractFloat32Exp( a );
4722     aSign = extractFloat32Sign( a );
4723 
4724     if ( aExp == 0 ) {
4725         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
4726         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
4727     }
4728     if ( aSign ) {
4729         float_raise(float_flag_invalid, status);
4730         return float32_default_nan(status);
4731     }
4732     if ( aExp == 0xFF ) {
4733         if (aSig) {
4734             return propagateFloat32NaN(a, float32_zero, status);
4735         }
4736         return a;
4737     }
4738 
4739     aExp -= 0x7F;
4740     aSig |= 0x00800000;
4741     zSign = aExp < 0;
4742     zSig = aExp << 23;
4743 
4744     for (i = 1 << 22; i > 0; i >>= 1) {
4745         aSig = ( (uint64_t)aSig * aSig ) >> 23;
4746         if ( aSig & 0x01000000 ) {
4747             aSig >>= 1;
4748             zSig |= i;
4749         }
4750     }
4751 
4752     if ( zSign )
4753         zSig = -zSig;
4754 
4755     return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
4756 }
4757 
4758 /*----------------------------------------------------------------------------
4759 | Returns 1 if the single-precision floating-point value `a' is equal to
4760 | the corresponding value `b', and 0 otherwise.  The invalid exception is
4761 | raised if either operand is a NaN.  Otherwise, the comparison is performed
4762 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4763 *----------------------------------------------------------------------------*/
4764 
4765 int float32_eq(float32 a, float32 b, float_status *status)
4766 {
4767     uint32_t av, bv;
4768     a = float32_squash_input_denormal(a, status);
4769     b = float32_squash_input_denormal(b, status);
4770 
4771     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4772          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4773        ) {
4774         float_raise(float_flag_invalid, status);
4775         return 0;
4776     }
4777     av = float32_val(a);
4778     bv = float32_val(b);
4779     return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
4780 }
4781 
4782 /*----------------------------------------------------------------------------
4783 | Returns 1 if the single-precision floating-point value `a' is less than
4784 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
4785 | exception is raised if either operand is a NaN.  The comparison is performed
4786 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4787 *----------------------------------------------------------------------------*/
4788 
4789 int float32_le(float32 a, float32 b, float_status *status)
4790 {
4791     flag aSign, bSign;
4792     uint32_t av, bv;
4793     a = float32_squash_input_denormal(a, status);
4794     b = float32_squash_input_denormal(b, status);
4795 
4796     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4797          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4798        ) {
4799         float_raise(float_flag_invalid, status);
4800         return 0;
4801     }
4802     aSign = extractFloat32Sign( a );
4803     bSign = extractFloat32Sign( b );
4804     av = float32_val(a);
4805     bv = float32_val(b);
4806     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
4807     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4808 
4809 }
4810 
4811 /*----------------------------------------------------------------------------
4812 | Returns 1 if the single-precision floating-point value `a' is less than
4813 | the corresponding value `b', and 0 otherwise.  The invalid exception is
4814 | raised if either operand is a NaN.  The comparison is performed according
4815 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4816 *----------------------------------------------------------------------------*/
4817 
4818 int float32_lt(float32 a, float32 b, float_status *status)
4819 {
4820     flag aSign, bSign;
4821     uint32_t av, bv;
4822     a = float32_squash_input_denormal(a, status);
4823     b = float32_squash_input_denormal(b, status);
4824 
4825     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4826          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4827        ) {
4828         float_raise(float_flag_invalid, status);
4829         return 0;
4830     }
4831     aSign = extractFloat32Sign( a );
4832     bSign = extractFloat32Sign( b );
4833     av = float32_val(a);
4834     bv = float32_val(b);
4835     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
4836     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4837 
4838 }
4839 
4840 /*----------------------------------------------------------------------------
4841 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
4842 | be compared, and 0 otherwise.  The invalid exception is raised if either
4843 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
4844 | Standard for Binary Floating-Point Arithmetic.
4845 *----------------------------------------------------------------------------*/
4846 
4847 int float32_unordered(float32 a, float32 b, float_status *status)
4848 {
4849     a = float32_squash_input_denormal(a, status);
4850     b = float32_squash_input_denormal(b, status);
4851 
4852     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4853          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4854        ) {
4855         float_raise(float_flag_invalid, status);
4856         return 1;
4857     }
4858     return 0;
4859 }
4860 
4861 /*----------------------------------------------------------------------------
4862 | Returns 1 if the single-precision floating-point value `a' is equal to
4863 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4864 | exception.  The comparison is performed according to the IEC/IEEE Standard
4865 | for Binary Floating-Point Arithmetic.
4866 *----------------------------------------------------------------------------*/
4867 
4868 int float32_eq_quiet(float32 a, float32 b, float_status *status)
4869 {
4870     a = float32_squash_input_denormal(a, status);
4871     b = float32_squash_input_denormal(b, status);
4872 
4873     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4874          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4875        ) {
4876         if (float32_is_signaling_nan(a, status)
4877          || float32_is_signaling_nan(b, status)) {
4878             float_raise(float_flag_invalid, status);
4879         }
4880         return 0;
4881     }
4882     return ( float32_val(a) == float32_val(b) ) ||
4883             ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
4884 }
4885 
4886 /*----------------------------------------------------------------------------
4887 | Returns 1 if the single-precision floating-point value `a' is less than or
4888 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4889 | cause an exception.  Otherwise, the comparison is performed according to the
4890 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4891 *----------------------------------------------------------------------------*/
4892 
4893 int float32_le_quiet(float32 a, float32 b, float_status *status)
4894 {
4895     flag aSign, bSign;
4896     uint32_t av, bv;
4897     a = float32_squash_input_denormal(a, status);
4898     b = float32_squash_input_denormal(b, status);
4899 
4900     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4901          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4902        ) {
4903         if (float32_is_signaling_nan(a, status)
4904          || float32_is_signaling_nan(b, status)) {
4905             float_raise(float_flag_invalid, status);
4906         }
4907         return 0;
4908     }
4909     aSign = extractFloat32Sign( a );
4910     bSign = extractFloat32Sign( b );
4911     av = float32_val(a);
4912     bv = float32_val(b);
4913     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
4914     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4915 
4916 }
4917 
4918 /*----------------------------------------------------------------------------
4919 | Returns 1 if the single-precision floating-point value `a' is less than
4920 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4921 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4922 | Standard for Binary Floating-Point Arithmetic.
4923 *----------------------------------------------------------------------------*/
4924 
4925 int float32_lt_quiet(float32 a, float32 b, float_status *status)
4926 {
4927     flag aSign, bSign;
4928     uint32_t av, bv;
4929     a = float32_squash_input_denormal(a, status);
4930     b = float32_squash_input_denormal(b, status);
4931 
4932     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4933          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4934        ) {
4935         if (float32_is_signaling_nan(a, status)
4936          || float32_is_signaling_nan(b, status)) {
4937             float_raise(float_flag_invalid, status);
4938         }
4939         return 0;
4940     }
4941     aSign = extractFloat32Sign( a );
4942     bSign = extractFloat32Sign( b );
4943     av = float32_val(a);
4944     bv = float32_val(b);
4945     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
4946     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4947 
4948 }
4949 
4950 /*----------------------------------------------------------------------------
4951 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
4952 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
4953 | comparison is performed according to the IEC/IEEE Standard for Binary
4954 | Floating-Point Arithmetic.
4955 *----------------------------------------------------------------------------*/
4956 
4957 int float32_unordered_quiet(float32 a, float32 b, float_status *status)
4958 {
4959     a = float32_squash_input_denormal(a, status);
4960     b = float32_squash_input_denormal(b, status);
4961 
4962     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
4963          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
4964        ) {
4965         if (float32_is_signaling_nan(a, status)
4966          || float32_is_signaling_nan(b, status)) {
4967             float_raise(float_flag_invalid, status);
4968         }
4969         return 1;
4970     }
4971     return 0;
4972 }
4973 
4974 /*----------------------------------------------------------------------------
4975 | If `a' is denormal and we are in flush-to-zero mode then set the
4976 | input-denormal exception and return zero. Otherwise just return the value.
4977 *----------------------------------------------------------------------------*/
4978 float16 float16_squash_input_denormal(float16 a, float_status *status)
4979 {
4980     if (status->flush_inputs_to_zero) {
4981         if (extractFloat16Exp(a) == 0 && extractFloat16Frac(a) != 0) {
4982             float_raise(float_flag_input_denormal, status);
4983             return make_float16(float16_val(a) & 0x8000);
4984         }
4985     }
4986     return a;
4987 }
4988 
4989 /*----------------------------------------------------------------------------
4990 | Returns the result of converting the double-precision floating-point value
4991 | `a' to the extended double-precision floating-point format.  The conversion
4992 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4993 | Arithmetic.
4994 *----------------------------------------------------------------------------*/
4995 
4996 floatx80 float64_to_floatx80(float64 a, float_status *status)
4997 {
4998     flag aSign;
4999     int aExp;
5000     uint64_t aSig;
5001 
5002     a = float64_squash_input_denormal(a, status);
5003     aSig = extractFloat64Frac( a );
5004     aExp = extractFloat64Exp( a );
5005     aSign = extractFloat64Sign( a );
5006     if ( aExp == 0x7FF ) {
5007         if (aSig) {
5008             return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
5009         }
5010         return packFloatx80(aSign,
5011                             floatx80_infinity_high,
5012                             floatx80_infinity_low);
5013     }
5014     if ( aExp == 0 ) {
5015         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
5016         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5017     }
5018     return
5019         packFloatx80(
5020             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
5021 
5022 }
5023 
5024 /*----------------------------------------------------------------------------
5025 | Returns the result of converting the double-precision floating-point value
5026 | `a' to the quadruple-precision floating-point format.  The conversion is
5027 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5028 | Arithmetic.
5029 *----------------------------------------------------------------------------*/
5030 
5031 float128 float64_to_float128(float64 a, float_status *status)
5032 {
5033     flag aSign;
5034     int aExp;
5035     uint64_t aSig, zSig0, zSig1;
5036 
5037     a = float64_squash_input_denormal(a, status);
5038     aSig = extractFloat64Frac( a );
5039     aExp = extractFloat64Exp( a );
5040     aSign = extractFloat64Sign( a );
5041     if ( aExp == 0x7FF ) {
5042         if (aSig) {
5043             return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
5044         }
5045         return packFloat128( aSign, 0x7FFF, 0, 0 );
5046     }
5047     if ( aExp == 0 ) {
5048         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
5049         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5050         --aExp;
5051     }
5052     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
5053     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
5054 
5055 }
5056 
5057 
/*----------------------------------------------------------------------------
| Returns the remainder of the double-precision floating-point value `a'
| with respect to the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
|
| Special cases handled before the reduction:
|   - either operand is a NaN: the NaN is propagated;
|   - `a' is infinite, or `b' is zero: the invalid exception is raised and
|     the default NaN is returned;
|   - `b' is infinite, or `a' is zero: `a' is returned unchanged.
*----------------------------------------------------------------------------*/

float64 float64_rem(float64 a, float64 b, float_status *status)
{
    flag aSign, zSign;
    int aExp, bExp, expDiff;
    uint64_t aSig, bSig;
    uint64_t q, alternateASig;
    int64_t sigMean;

    a = float64_squash_input_denormal(a, status);
    b = float64_squash_input_denormal(b, status);
    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    aSign = extractFloat64Sign( a );
    bSig = extractFloat64Frac( b );
    bExp = extractFloat64Exp( b );
    /* `a' is a NaN or an infinity. */
    if ( aExp == 0x7FF ) {
        if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
            return propagateFloat64NaN(a, b, status);
        }
        /* Infinite dividend with a non-NaN divisor: invalid. */
        float_raise(float_flag_invalid, status);
        return float64_default_nan(status);
    }
    /* `b' is a NaN or an infinity (and `a' is finite). */
    if ( bExp == 0x7FF ) {
        if (bSig) {
            return propagateFloat64NaN(a, b, status);
        }
        return a;
    }
    if ( bExp == 0 ) {
        /* Division by zero is invalid for the remainder operation. */
        if ( bSig == 0 ) {
            float_raise(float_flag_invalid, status);
            return float64_default_nan(status);
        }
        normalizeFloat64Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return a;
        normalizeFloat64Subnormal( aSig, &aExp, &aSig );
    }
    expDiff = aExp - bExp;
    /* Make the leading one explicit and left-justify both significands. */
    aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
    bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
    if ( expDiff < 0 ) {
        /* The exponent of `a' is at least two below that of `b':
         * `a' is returned unchanged. */
        if ( expDiff < -1 ) return a;
        aSig >>= 1;
    }
    /* First quotient bit. */
    q = ( bSig <= aSig );
    if ( q ) aSig -= bSig;
    expDiff -= 64;
    /* Reduce the remaining exponent difference 62 at a time, using an
     * estimated quotient lowered by 2 (clamped at 0). */
    while ( 0 < expDiff ) {
        q = estimateDiv128To64( aSig, 0, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        aSig = - ( ( bSig>>2 ) * q );
        expDiff -= 62;
    }
    expDiff += 64;
    if ( 0 < expDiff ) {
        /* Final partial step: keep only the top `expDiff' quotient bits. */
        q = estimateDiv128To64( aSig, 0, bSig );
        q = ( 2 < q ) ? q - 2 : 0;
        q >>= 64 - expDiff;
        bSig >>= 2;
        aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
    }
    else {
        aSig >>= 2;
        bSig >>= 2;
    }
    /* Keep subtracting `bSig' until the remainder turns negative, remembering
     * the last non-negative value in `alternateASig'. */
    do {
        alternateASig = aSig;
        ++q;
        aSig -= bSig;
    } while ( 0 <= (int64_t) aSig );
    /* Choose between the negative and the last non-negative remainder using
     * the sign of their sum; when the sum is zero the parity of the quotient
     * `q' decides. */
    sigMean = aSig + alternateASig;
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
        aSig = alternateASig;
    }
    /* A negative remainder flips the sign of the result relative to `a'. */
    zSign = ( (int64_t) aSig < 0 );
    if ( zSign ) aSig = - aSig;
    return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);

}
5145 
5146 /*----------------------------------------------------------------------------
5147 | Returns the binary log of the double-precision floating-point value `a'.
5148 | The operation is performed according to the IEC/IEEE Standard for Binary
5149 | Floating-Point Arithmetic.
5150 *----------------------------------------------------------------------------*/
5151 float64 float64_log2(float64 a, float_status *status)
5152 {
5153     flag aSign, zSign;
5154     int aExp;
5155     uint64_t aSig, aSig0, aSig1, zSig, i;
5156     a = float64_squash_input_denormal(a, status);
5157 
5158     aSig = extractFloat64Frac( a );
5159     aExp = extractFloat64Exp( a );
5160     aSign = extractFloat64Sign( a );
5161 
5162     if ( aExp == 0 ) {
5163         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
5164         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
5165     }
5166     if ( aSign ) {
5167         float_raise(float_flag_invalid, status);
5168         return float64_default_nan(status);
5169     }
5170     if ( aExp == 0x7FF ) {
5171         if (aSig) {
5172             return propagateFloat64NaN(a, float64_zero, status);
5173         }
5174         return a;
5175     }
5176 
5177     aExp -= 0x3FF;
5178     aSig |= LIT64( 0x0010000000000000 );
5179     zSign = aExp < 0;
5180     zSig = (uint64_t)aExp << 52;
5181     for (i = 1LL << 51; i > 0; i >>= 1) {
5182         mul64To128( aSig, aSig, &aSig0, &aSig1 );
5183         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
5184         if ( aSig & LIT64( 0x0020000000000000 ) ) {
5185             aSig >>= 1;
5186             zSig |= i;
5187         }
5188     }
5189 
5190     if ( zSign )
5191         zSig = -zSig;
5192     return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
5193 }
5194 
5195 /*----------------------------------------------------------------------------
5196 | Returns 1 if the double-precision floating-point value `a' is equal to the
5197 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
5198 | if either operand is a NaN.  Otherwise, the comparison is performed
5199 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5200 *----------------------------------------------------------------------------*/
5201 
5202 int float64_eq(float64 a, float64 b, float_status *status)
5203 {
5204     uint64_t av, bv;
5205     a = float64_squash_input_denormal(a, status);
5206     b = float64_squash_input_denormal(b, status);
5207 
5208     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5209          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5210        ) {
5211         float_raise(float_flag_invalid, status);
5212         return 0;
5213     }
5214     av = float64_val(a);
5215     bv = float64_val(b);
5216     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
5217 
5218 }
5219 
5220 /*----------------------------------------------------------------------------
5221 | Returns 1 if the double-precision floating-point value `a' is less than or
5222 | equal to the corresponding value `b', and 0 otherwise.  The invalid
5223 | exception is raised if either operand is a NaN.  The comparison is performed
5224 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5225 *----------------------------------------------------------------------------*/
5226 
5227 int float64_le(float64 a, float64 b, float_status *status)
5228 {
5229     flag aSign, bSign;
5230     uint64_t av, bv;
5231     a = float64_squash_input_denormal(a, status);
5232     b = float64_squash_input_denormal(b, status);
5233 
5234     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5235          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5236        ) {
5237         float_raise(float_flag_invalid, status);
5238         return 0;
5239     }
5240     aSign = extractFloat64Sign( a );
5241     bSign = extractFloat64Sign( b );
5242     av = float64_val(a);
5243     bv = float64_val(b);
5244     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
5245     return ( av == bv ) || ( aSign ^ ( av < bv ) );
5246 
5247 }
5248 
5249 /*----------------------------------------------------------------------------
5250 | Returns 1 if the double-precision floating-point value `a' is less than
5251 | the corresponding value `b', and 0 otherwise.  The invalid exception is
5252 | raised if either operand is a NaN.  The comparison is performed according
5253 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5254 *----------------------------------------------------------------------------*/
5255 
5256 int float64_lt(float64 a, float64 b, float_status *status)
5257 {
5258     flag aSign, bSign;
5259     uint64_t av, bv;
5260 
5261     a = float64_squash_input_denormal(a, status);
5262     b = float64_squash_input_denormal(b, status);
5263     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5264          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5265        ) {
5266         float_raise(float_flag_invalid, status);
5267         return 0;
5268     }
5269     aSign = extractFloat64Sign( a );
5270     bSign = extractFloat64Sign( b );
5271     av = float64_val(a);
5272     bv = float64_val(b);
5273     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
5274     return ( av != bv ) && ( aSign ^ ( av < bv ) );
5275 
5276 }
5277 
5278 /*----------------------------------------------------------------------------
5279 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
5280 | be compared, and 0 otherwise.  The invalid exception is raised if either
5281 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
5282 | Standard for Binary Floating-Point Arithmetic.
5283 *----------------------------------------------------------------------------*/
5284 
5285 int float64_unordered(float64 a, float64 b, float_status *status)
5286 {
5287     a = float64_squash_input_denormal(a, status);
5288     b = float64_squash_input_denormal(b, status);
5289 
5290     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5291          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5292        ) {
5293         float_raise(float_flag_invalid, status);
5294         return 1;
5295     }
5296     return 0;
5297 }
5298 
5299 /*----------------------------------------------------------------------------
5300 | Returns 1 if the double-precision floating-point value `a' is equal to the
5301 | corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5302 | exception.The comparison is performed according to the IEC/IEEE Standard
5303 | for Binary Floating-Point Arithmetic.
5304 *----------------------------------------------------------------------------*/
5305 
5306 int float64_eq_quiet(float64 a, float64 b, float_status *status)
5307 {
5308     uint64_t av, bv;
5309     a = float64_squash_input_denormal(a, status);
5310     b = float64_squash_input_denormal(b, status);
5311 
5312     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5313          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5314        ) {
5315         if (float64_is_signaling_nan(a, status)
5316          || float64_is_signaling_nan(b, status)) {
5317             float_raise(float_flag_invalid, status);
5318         }
5319         return 0;
5320     }
5321     av = float64_val(a);
5322     bv = float64_val(b);
5323     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
5324 
5325 }
5326 
5327 /*----------------------------------------------------------------------------
5328 | Returns 1 if the double-precision floating-point value `a' is less than or
5329 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5330 | cause an exception.  Otherwise, the comparison is performed according to the
5331 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5332 *----------------------------------------------------------------------------*/
5333 
5334 int float64_le_quiet(float64 a, float64 b, float_status *status)
5335 {
5336     flag aSign, bSign;
5337     uint64_t av, bv;
5338     a = float64_squash_input_denormal(a, status);
5339     b = float64_squash_input_denormal(b, status);
5340 
5341     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5342          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5343        ) {
5344         if (float64_is_signaling_nan(a, status)
5345          || float64_is_signaling_nan(b, status)) {
5346             float_raise(float_flag_invalid, status);
5347         }
5348         return 0;
5349     }
5350     aSign = extractFloat64Sign( a );
5351     bSign = extractFloat64Sign( b );
5352     av = float64_val(a);
5353     bv = float64_val(b);
5354     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
5355     return ( av == bv ) || ( aSign ^ ( av < bv ) );
5356 
5357 }
5358 
5359 /*----------------------------------------------------------------------------
5360 | Returns 1 if the double-precision floating-point value `a' is less than
5361 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5362 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
5363 | Standard for Binary Floating-Point Arithmetic.
5364 *----------------------------------------------------------------------------*/
5365 
5366 int float64_lt_quiet(float64 a, float64 b, float_status *status)
5367 {
5368     flag aSign, bSign;
5369     uint64_t av, bv;
5370     a = float64_squash_input_denormal(a, status);
5371     b = float64_squash_input_denormal(b, status);
5372 
5373     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5374          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5375        ) {
5376         if (float64_is_signaling_nan(a, status)
5377          || float64_is_signaling_nan(b, status)) {
5378             float_raise(float_flag_invalid, status);
5379         }
5380         return 0;
5381     }
5382     aSign = extractFloat64Sign( a );
5383     bSign = extractFloat64Sign( b );
5384     av = float64_val(a);
5385     bv = float64_val(b);
5386     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
5387     return ( av != bv ) && ( aSign ^ ( av < bv ) );
5388 
5389 }
5390 
5391 /*----------------------------------------------------------------------------
5392 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
5393 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
5394 | comparison is performed according to the IEC/IEEE Standard for Binary
5395 | Floating-Point Arithmetic.
5396 *----------------------------------------------------------------------------*/
5397 
5398 int float64_unordered_quiet(float64 a, float64 b, float_status *status)
5399 {
5400     a = float64_squash_input_denormal(a, status);
5401     b = float64_squash_input_denormal(b, status);
5402 
5403     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
5404          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
5405        ) {
5406         if (float64_is_signaling_nan(a, status)
5407          || float64_is_signaling_nan(b, status)) {
5408             float_raise(float_flag_invalid, status);
5409         }
5410         return 1;
5411     }
5412     return 0;
5413 }
5414 
5415 /*----------------------------------------------------------------------------
5416 | Returns the result of converting the extended double-precision floating-
5417 | point value `a' to the 32-bit two's complement integer format.  The
5418 | conversion is performed according to the IEC/IEEE Standard for Binary
5419 | Floating-Point Arithmetic---which means in particular that the conversion
5420 | is rounded according to the current rounding mode.  If `a' is a NaN, the
5421 | largest positive integer is returned.  Otherwise, if the conversion
5422 | overflows, the largest integer with the same sign as `a' is returned.
5423 *----------------------------------------------------------------------------*/
5424 
5425 int32_t floatx80_to_int32(floatx80 a, float_status *status)
5426 {
5427     flag aSign;
5428     int32_t aExp, shiftCount;
5429     uint64_t aSig;
5430 
5431     if (floatx80_invalid_encoding(a)) {
5432         float_raise(float_flag_invalid, status);
5433         return 1 << 31;
5434     }
5435     aSig = extractFloatx80Frac( a );
5436     aExp = extractFloatx80Exp( a );
5437     aSign = extractFloatx80Sign( a );
5438     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
5439     shiftCount = 0x4037 - aExp;
5440     if ( shiftCount <= 0 ) shiftCount = 1;
5441     shift64RightJamming( aSig, shiftCount, &aSig );
5442     return roundAndPackInt32(aSign, aSig, status);
5443 
5444 }
5445 
5446 /*----------------------------------------------------------------------------
5447 | Returns the result of converting the extended double-precision floating-
5448 | point value `a' to the 32-bit two's complement integer format.  The
5449 | conversion is performed according to the IEC/IEEE Standard for Binary
5450 | Floating-Point Arithmetic, except that the conversion is always rounded
5451 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
5452 | Otherwise, if the conversion overflows, the largest integer with the same
5453 | sign as `a' is returned.
5454 *----------------------------------------------------------------------------*/
5455 
5456 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
5457 {
5458     flag aSign;
5459     int32_t aExp, shiftCount;
5460     uint64_t aSig, savedASig;
5461     int32_t z;
5462 
5463     if (floatx80_invalid_encoding(a)) {
5464         float_raise(float_flag_invalid, status);
5465         return 1 << 31;
5466     }
5467     aSig = extractFloatx80Frac( a );
5468     aExp = extractFloatx80Exp( a );
5469     aSign = extractFloatx80Sign( a );
5470     if ( 0x401E < aExp ) {
5471         if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
5472         goto invalid;
5473     }
5474     else if ( aExp < 0x3FFF ) {
5475         if (aExp || aSig) {
5476             status->float_exception_flags |= float_flag_inexact;
5477         }
5478         return 0;
5479     }
5480     shiftCount = 0x403E - aExp;
5481     savedASig = aSig;
5482     aSig >>= shiftCount;
5483     z = aSig;
5484     if ( aSign ) z = - z;
5485     if ( ( z < 0 ) ^ aSign ) {
5486  invalid:
5487         float_raise(float_flag_invalid, status);
5488         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5489     }
5490     if ( ( aSig<<shiftCount ) != savedASig ) {
5491         status->float_exception_flags |= float_flag_inexact;
5492     }
5493     return z;
5494 
5495 }
5496 
5497 /*----------------------------------------------------------------------------
5498 | Returns the result of converting the extended double-precision floating-
5499 | point value `a' to the 64-bit two's complement integer format.  The
5500 | conversion is performed according to the IEC/IEEE Standard for Binary
5501 | Floating-Point Arithmetic---which means in particular that the conversion
5502 | is rounded according to the current rounding mode.  If `a' is a NaN,
5503 | the largest positive integer is returned.  Otherwise, if the conversion
5504 | overflows, the largest integer with the same sign as `a' is returned.
5505 *----------------------------------------------------------------------------*/
5506 
5507 int64_t floatx80_to_int64(floatx80 a, float_status *status)
5508 {
5509     flag aSign;
5510     int32_t aExp, shiftCount;
5511     uint64_t aSig, aSigExtra;
5512 
5513     if (floatx80_invalid_encoding(a)) {
5514         float_raise(float_flag_invalid, status);
5515         return 1ULL << 63;
5516     }
5517     aSig = extractFloatx80Frac( a );
5518     aExp = extractFloatx80Exp( a );
5519     aSign = extractFloatx80Sign( a );
5520     shiftCount = 0x403E - aExp;
5521     if ( shiftCount <= 0 ) {
5522         if ( shiftCount ) {
5523             float_raise(float_flag_invalid, status);
5524             if (!aSign || floatx80_is_any_nan(a)) {
5525                 return LIT64( 0x7FFFFFFFFFFFFFFF );
5526             }
5527             return (int64_t) LIT64( 0x8000000000000000 );
5528         }
5529         aSigExtra = 0;
5530     }
5531     else {
5532         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
5533     }
5534     return roundAndPackInt64(aSign, aSig, aSigExtra, status);
5535 
5536 }
5537 
5538 /*----------------------------------------------------------------------------
5539 | Returns the result of converting the extended double-precision floating-
5540 | point value `a' to the 64-bit two's complement integer format.  The
5541 | conversion is performed according to the IEC/IEEE Standard for Binary
5542 | Floating-Point Arithmetic, except that the conversion is always rounded
5543 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
5544 | Otherwise, if the conversion overflows, the largest integer with the same
5545 | sign as `a' is returned.
5546 *----------------------------------------------------------------------------*/
5547 
5548 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
5549 {
5550     flag aSign;
5551     int32_t aExp, shiftCount;
5552     uint64_t aSig;
5553     int64_t z;
5554 
5555     if (floatx80_invalid_encoding(a)) {
5556         float_raise(float_flag_invalid, status);
5557         return 1ULL << 63;
5558     }
5559     aSig = extractFloatx80Frac( a );
5560     aExp = extractFloatx80Exp( a );
5561     aSign = extractFloatx80Sign( a );
5562     shiftCount = aExp - 0x403E;
5563     if ( 0 <= shiftCount ) {
5564         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
5565         if ( ( a.high != 0xC03E ) || aSig ) {
5566             float_raise(float_flag_invalid, status);
5567             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
5568                 return LIT64( 0x7FFFFFFFFFFFFFFF );
5569             }
5570         }
5571         return (int64_t) LIT64( 0x8000000000000000 );
5572     }
5573     else if ( aExp < 0x3FFF ) {
5574         if (aExp | aSig) {
5575             status->float_exception_flags |= float_flag_inexact;
5576         }
5577         return 0;
5578     }
5579     z = aSig>>( - shiftCount );
5580     if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
5581         status->float_exception_flags |= float_flag_inexact;
5582     }
5583     if ( aSign ) z = - z;
5584     return z;
5585 
5586 }
5587 
5588 /*----------------------------------------------------------------------------
5589 | Returns the result of converting the extended double-precision floating-
5590 | point value `a' to the single-precision floating-point format.  The
5591 | conversion is performed according to the IEC/IEEE Standard for Binary
5592 | Floating-Point Arithmetic.
5593 *----------------------------------------------------------------------------*/
5594 
5595 float32 floatx80_to_float32(floatx80 a, float_status *status)
5596 {
5597     flag aSign;
5598     int32_t aExp;
5599     uint64_t aSig;
5600 
5601     if (floatx80_invalid_encoding(a)) {
5602         float_raise(float_flag_invalid, status);
5603         return float32_default_nan(status);
5604     }
5605     aSig = extractFloatx80Frac( a );
5606     aExp = extractFloatx80Exp( a );
5607     aSign = extractFloatx80Sign( a );
5608     if ( aExp == 0x7FFF ) {
5609         if ( (uint64_t) ( aSig<<1 ) ) {
5610             return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
5611         }
5612         return packFloat32( aSign, 0xFF, 0 );
5613     }
5614     shift64RightJamming( aSig, 33, &aSig );
5615     if ( aExp || aSig ) aExp -= 0x3F81;
5616     return roundAndPackFloat32(aSign, aExp, aSig, status);
5617 
5618 }
5619 
5620 /*----------------------------------------------------------------------------
5621 | Returns the result of converting the extended double-precision floating-
5622 | point value `a' to the double-precision floating-point format.  The
5623 | conversion is performed according to the IEC/IEEE Standard for Binary
5624 | Floating-Point Arithmetic.
5625 *----------------------------------------------------------------------------*/
5626 
5627 float64 floatx80_to_float64(floatx80 a, float_status *status)
5628 {
5629     flag aSign;
5630     int32_t aExp;
5631     uint64_t aSig, zSig;
5632 
5633     if (floatx80_invalid_encoding(a)) {
5634         float_raise(float_flag_invalid, status);
5635         return float64_default_nan(status);
5636     }
5637     aSig = extractFloatx80Frac( a );
5638     aExp = extractFloatx80Exp( a );
5639     aSign = extractFloatx80Sign( a );
5640     if ( aExp == 0x7FFF ) {
5641         if ( (uint64_t) ( aSig<<1 ) ) {
5642             return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
5643         }
5644         return packFloat64( aSign, 0x7FF, 0 );
5645     }
5646     shift64RightJamming( aSig, 1, &zSig );
5647     if ( aExp || aSig ) aExp -= 0x3C01;
5648     return roundAndPackFloat64(aSign, aExp, zSig, status);
5649 
5650 }
5651 
5652 /*----------------------------------------------------------------------------
5653 | Returns the result of converting the extended double-precision floating-
5654 | point value `a' to the quadruple-precision floating-point format.  The
5655 | conversion is performed according to the IEC/IEEE Standard for Binary
5656 | Floating-Point Arithmetic.
5657 *----------------------------------------------------------------------------*/
5658 
5659 float128 floatx80_to_float128(floatx80 a, float_status *status)
5660 {
5661     flag aSign;
5662     int aExp;
5663     uint64_t aSig, zSig0, zSig1;
5664 
5665     if (floatx80_invalid_encoding(a)) {
5666         float_raise(float_flag_invalid, status);
5667         return float128_default_nan(status);
5668     }
5669     aSig = extractFloatx80Frac( a );
5670     aExp = extractFloatx80Exp( a );
5671     aSign = extractFloatx80Sign( a );
5672     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
5673         return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
5674     }
5675     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
5676     return packFloat128( aSign, aExp, zSig0, zSig1 );
5677 
5678 }
5679 
5680 /*----------------------------------------------------------------------------
5681 | Rounds the extended double-precision floating-point value `a'
5682 | to the precision provided by floatx80_rounding_precision and returns the
5683 | result as an extended double-precision floating-point value.
5684 | The operation is performed according to the IEC/IEEE Standard for Binary
5685 | Floating-Point Arithmetic.
5686 *----------------------------------------------------------------------------*/
5687 
5688 floatx80 floatx80_round(floatx80 a, float_status *status)
5689 {
5690     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5691                                 extractFloatx80Sign(a),
5692                                 extractFloatx80Exp(a),
5693                                 extractFloatx80Frac(a), 0, status);
5694 }
5695 
5696 /*----------------------------------------------------------------------------
5697 | Rounds the extended double-precision floating-point value `a' to an integer,
5698 | and returns the result as an extended quadruple-precision floating-point
5699 | value.  The operation is performed according to the IEC/IEEE Standard for
5700 | Binary Floating-Point Arithmetic.
5701 *----------------------------------------------------------------------------*/
5702 
5703 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
5704 {
5705     flag aSign;
5706     int32_t aExp;
5707     uint64_t lastBitMask, roundBitsMask;
5708     floatx80 z;
5709 
5710     if (floatx80_invalid_encoding(a)) {
5711         float_raise(float_flag_invalid, status);
5712         return floatx80_default_nan(status);
5713     }
5714     aExp = extractFloatx80Exp( a );
5715     if ( 0x403E <= aExp ) {
5716         if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
5717             return propagateFloatx80NaN(a, a, status);
5718         }
5719         return a;
5720     }
5721     if ( aExp < 0x3FFF ) {
5722         if (    ( aExp == 0 )
5723              && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
5724             return a;
5725         }
5726         status->float_exception_flags |= float_flag_inexact;
5727         aSign = extractFloatx80Sign( a );
5728         switch (status->float_rounding_mode) {
5729          case float_round_nearest_even:
5730             if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
5731                ) {
5732                 return
5733                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
5734             }
5735             break;
5736         case float_round_ties_away:
5737             if (aExp == 0x3FFE) {
5738                 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
5739             }
5740             break;
5741          case float_round_down:
5742             return
5743                   aSign ?
5744                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
5745                 : packFloatx80( 0, 0, 0 );
5746          case float_round_up:
5747             return
5748                   aSign ? packFloatx80( 1, 0, 0 )
5749                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
5750         }
5751         return packFloatx80( aSign, 0, 0 );
5752     }
5753     lastBitMask = 1;
5754     lastBitMask <<= 0x403E - aExp;
5755     roundBitsMask = lastBitMask - 1;
5756     z = a;
5757     switch (status->float_rounding_mode) {
5758     case float_round_nearest_even:
5759         z.low += lastBitMask>>1;
5760         if ((z.low & roundBitsMask) == 0) {
5761             z.low &= ~lastBitMask;
5762         }
5763         break;
5764     case float_round_ties_away:
5765         z.low += lastBitMask >> 1;
5766         break;
5767     case float_round_to_zero:
5768         break;
5769     case float_round_up:
5770         if (!extractFloatx80Sign(z)) {
5771             z.low += roundBitsMask;
5772         }
5773         break;
5774     case float_round_down:
5775         if (extractFloatx80Sign(z)) {
5776             z.low += roundBitsMask;
5777         }
5778         break;
5779     default:
5780         abort();
5781     }
5782     z.low &= ~ roundBitsMask;
5783     if ( z.low == 0 ) {
5784         ++z.high;
5785         z.low = LIT64( 0x8000000000000000 );
5786     }
5787     if (z.low != a.low) {
5788         status->float_exception_flags |= float_flag_inexact;
5789     }
5790     return z;
5791 
5792 }
5793 
5794 /*----------------------------------------------------------------------------
5795 | Returns the result of adding the absolute values of the extended double-
5796 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
5797 | negated before being returned.  `zSign' is ignored if the result is a NaN.
5798 | The addition is performed according to the IEC/IEEE Standard for Binary
5799 | Floating-Point Arithmetic.
5800 *----------------------------------------------------------------------------*/
5801 
5802 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5803                                 float_status *status)
5804 {
5805     int32_t aExp, bExp, zExp;
5806     uint64_t aSig, bSig, zSig0, zSig1;
5807     int32_t expDiff;
5808 
5809     aSig = extractFloatx80Frac( a );
5810     aExp = extractFloatx80Exp( a );
5811     bSig = extractFloatx80Frac( b );
5812     bExp = extractFloatx80Exp( b );
5813     expDiff = aExp - bExp;
5814     if ( 0 < expDiff ) {
5815         if ( aExp == 0x7FFF ) {
5816             if ((uint64_t)(aSig << 1)) {
5817                 return propagateFloatx80NaN(a, b, status);
5818             }
5819             return a;
5820         }
5821         if ( bExp == 0 ) --expDiff;
5822         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5823         zExp = aExp;
5824     }
5825     else if ( expDiff < 0 ) {
5826         if ( bExp == 0x7FFF ) {
5827             if ((uint64_t)(bSig << 1)) {
5828                 return propagateFloatx80NaN(a, b, status);
5829             }
5830             return packFloatx80(zSign,
5831                                 floatx80_infinity_high,
5832                                 floatx80_infinity_low);
5833         }
5834         if ( aExp == 0 ) ++expDiff;
5835         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5836         zExp = bExp;
5837     }
5838     else {
5839         if ( aExp == 0x7FFF ) {
5840             if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5841                 return propagateFloatx80NaN(a, b, status);
5842             }
5843             return a;
5844         }
5845         zSig1 = 0;
5846         zSig0 = aSig + bSig;
5847         if ( aExp == 0 ) {
5848             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
5849             goto roundAndPack;
5850         }
5851         zExp = aExp;
5852         goto shiftRight1;
5853     }
5854     zSig0 = aSig + bSig;
5855     if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
5856  shiftRight1:
5857     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
5858     zSig0 |= LIT64( 0x8000000000000000 );
5859     ++zExp;
5860  roundAndPack:
5861     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5862                                 zSign, zExp, zSig0, zSig1, status);
5863 }
5864 
5865 /*----------------------------------------------------------------------------
5866 | Returns the result of subtracting the absolute values of the extended
5867 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
5868 | difference is negated before being returned.  `zSign' is ignored if the
5869 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
5870 | Standard for Binary Floating-Point Arithmetic.
5871 *----------------------------------------------------------------------------*/
5872 
5873 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5874                                 float_status *status)
5875 {
5876     int32_t aExp, bExp, zExp;
5877     uint64_t aSig, bSig, zSig0, zSig1;
5878     int32_t expDiff;
5879 
5880     aSig = extractFloatx80Frac( a );
5881     aExp = extractFloatx80Exp( a );
5882     bSig = extractFloatx80Frac( b );
5883     bExp = extractFloatx80Exp( b );
5884     expDiff = aExp - bExp;
5885     if ( 0 < expDiff ) goto aExpBigger;
5886     if ( expDiff < 0 ) goto bExpBigger;
5887     if ( aExp == 0x7FFF ) {
5888         if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5889             return propagateFloatx80NaN(a, b, status);
5890         }
5891         float_raise(float_flag_invalid, status);
5892         return floatx80_default_nan(status);
5893     }
5894     if ( aExp == 0 ) {
5895         aExp = 1;
5896         bExp = 1;
5897     }
5898     zSig1 = 0;
5899     if ( bSig < aSig ) goto aBigger;
5900     if ( aSig < bSig ) goto bBigger;
5901     return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
5902  bExpBigger:
5903     if ( bExp == 0x7FFF ) {
5904         if ((uint64_t)(bSig << 1)) {
5905             return propagateFloatx80NaN(a, b, status);
5906         }
5907         return packFloatx80(zSign ^ 1, floatx80_infinity_high,
5908                             floatx80_infinity_low);
5909     }
5910     if ( aExp == 0 ) ++expDiff;
5911     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5912  bBigger:
5913     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
5914     zExp = bExp;
5915     zSign ^= 1;
5916     goto normalizeRoundAndPack;
5917  aExpBigger:
5918     if ( aExp == 0x7FFF ) {
5919         if ((uint64_t)(aSig << 1)) {
5920             return propagateFloatx80NaN(a, b, status);
5921         }
5922         return a;
5923     }
5924     if ( bExp == 0 ) --expDiff;
5925     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5926  aBigger:
5927     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
5928     zExp = aExp;
5929  normalizeRoundAndPack:
5930     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
5931                                          zSign, zExp, zSig0, zSig1, status);
5932 }
5933 
5934 /*----------------------------------------------------------------------------
5935 | Returns the result of adding the extended double-precision floating-point
5936 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
5937 | Standard for Binary Floating-Point Arithmetic.
5938 *----------------------------------------------------------------------------*/
5939 
5940 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
5941 {
5942     flag aSign, bSign;
5943 
5944     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5945         float_raise(float_flag_invalid, status);
5946         return floatx80_default_nan(status);
5947     }
5948     aSign = extractFloatx80Sign( a );
5949     bSign = extractFloatx80Sign( b );
5950     if ( aSign == bSign ) {
5951         return addFloatx80Sigs(a, b, aSign, status);
5952     }
5953     else {
5954         return subFloatx80Sigs(a, b, aSign, status);
5955     }
5956 
5957 }
5958 
5959 /*----------------------------------------------------------------------------
5960 | Returns the result of subtracting the extended double-precision floating-
5961 | point values `a' and `b'.  The operation is performed according to the
5962 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5963 *----------------------------------------------------------------------------*/
5964 
5965 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
5966 {
5967     flag aSign, bSign;
5968 
5969     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5970         float_raise(float_flag_invalid, status);
5971         return floatx80_default_nan(status);
5972     }
5973     aSign = extractFloatx80Sign( a );
5974     bSign = extractFloatx80Sign( b );
5975     if ( aSign == bSign ) {
5976         return subFloatx80Sigs(a, b, aSign, status);
5977     }
5978     else {
5979         return addFloatx80Sigs(a, b, aSign, status);
5980     }
5981 
5982 }
5983 
5984 /*----------------------------------------------------------------------------
5985 | Returns the result of multiplying the extended double-precision floating-
5986 | point values `a' and `b'.  The operation is performed according to the
5987 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5988 *----------------------------------------------------------------------------*/
5989 
5990 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
5991 {
5992     flag aSign, bSign, zSign;
5993     int32_t aExp, bExp, zExp;
5994     uint64_t aSig, bSig, zSig0, zSig1;
5995 
5996     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5997         float_raise(float_flag_invalid, status);
5998         return floatx80_default_nan(status);
5999     }
6000     aSig = extractFloatx80Frac( a );
6001     aExp = extractFloatx80Exp( a );
6002     aSign = extractFloatx80Sign( a );
6003     bSig = extractFloatx80Frac( b );
6004     bExp = extractFloatx80Exp( b );
6005     bSign = extractFloatx80Sign( b );
6006     zSign = aSign ^ bSign;
6007     if ( aExp == 0x7FFF ) {
6008         if (    (uint64_t) ( aSig<<1 )
6009              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
6010             return propagateFloatx80NaN(a, b, status);
6011         }
6012         if ( ( bExp | bSig ) == 0 ) goto invalid;
6013         return packFloatx80(zSign, floatx80_infinity_high,
6014                                    floatx80_infinity_low);
6015     }
6016     if ( bExp == 0x7FFF ) {
6017         if ((uint64_t)(bSig << 1)) {
6018             return propagateFloatx80NaN(a, b, status);
6019         }
6020         if ( ( aExp | aSig ) == 0 ) {
6021  invalid:
6022             float_raise(float_flag_invalid, status);
6023             return floatx80_default_nan(status);
6024         }
6025         return packFloatx80(zSign, floatx80_infinity_high,
6026                                    floatx80_infinity_low);
6027     }
6028     if ( aExp == 0 ) {
6029         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
6030         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
6031     }
6032     if ( bExp == 0 ) {
6033         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
6034         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
6035     }
6036     zExp = aExp + bExp - 0x3FFE;
6037     mul64To128( aSig, bSig, &zSig0, &zSig1 );
6038     if ( 0 < (int64_t) zSig0 ) {
6039         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
6040         --zExp;
6041     }
6042     return roundAndPackFloatx80(status->floatx80_rounding_precision,
6043                                 zSign, zExp, zSig0, zSig1, status);
6044 }
6045 
6046 /*----------------------------------------------------------------------------
6047 | Returns the result of dividing the extended double-precision floating-point
6048 | value `a' by the corresponding value `b'.  The operation is performed
6049 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6050 *----------------------------------------------------------------------------*/
6051 
6052 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
6053 {
6054     flag aSign, bSign, zSign;
6055     int32_t aExp, bExp, zExp;
6056     uint64_t aSig, bSig, zSig0, zSig1;
6057     uint64_t rem0, rem1, rem2, term0, term1, term2;
6058 
6059     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6060         float_raise(float_flag_invalid, status);
6061         return floatx80_default_nan(status);
6062     }
6063     aSig = extractFloatx80Frac( a );
6064     aExp = extractFloatx80Exp( a );
6065     aSign = extractFloatx80Sign( a );
6066     bSig = extractFloatx80Frac( b );
6067     bExp = extractFloatx80Exp( b );
6068     bSign = extractFloatx80Sign( b );
6069     zSign = aSign ^ bSign;
6070     if ( aExp == 0x7FFF ) {
6071         if ((uint64_t)(aSig << 1)) {
6072             return propagateFloatx80NaN(a, b, status);
6073         }
6074         if ( bExp == 0x7FFF ) {
6075             if ((uint64_t)(bSig << 1)) {
6076                 return propagateFloatx80NaN(a, b, status);
6077             }
6078             goto invalid;
6079         }
6080         return packFloatx80(zSign, floatx80_infinity_high,
6081                                    floatx80_infinity_low);
6082     }
6083     if ( bExp == 0x7FFF ) {
6084         if ((uint64_t)(bSig << 1)) {
6085             return propagateFloatx80NaN(a, b, status);
6086         }
6087         return packFloatx80( zSign, 0, 0 );
6088     }
6089     if ( bExp == 0 ) {
6090         if ( bSig == 0 ) {
6091             if ( ( aExp | aSig ) == 0 ) {
6092  invalid:
6093                 float_raise(float_flag_invalid, status);
6094                 return floatx80_default_nan(status);
6095             }
6096             float_raise(float_flag_divbyzero, status);
6097             return packFloatx80(zSign, floatx80_infinity_high,
6098                                        floatx80_infinity_low);
6099         }
6100         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
6101     }
6102     if ( aExp == 0 ) {
6103         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
6104         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
6105     }
6106     zExp = aExp - bExp + 0x3FFE;
6107     rem1 = 0;
6108     if ( bSig <= aSig ) {
6109         shift128Right( aSig, 0, 1, &aSig, &rem1 );
6110         ++zExp;
6111     }
6112     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
6113     mul64To128( bSig, zSig0, &term0, &term1 );
6114     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
6115     while ( (int64_t) rem0 < 0 ) {
6116         --zSig0;
6117         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
6118     }
6119     zSig1 = estimateDiv128To64( rem1, 0, bSig );
6120     if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
6121         mul64To128( bSig, zSig1, &term1, &term2 );
6122         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6123         while ( (int64_t) rem1 < 0 ) {
6124             --zSig1;
6125             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
6126         }
6127         zSig1 |= ( ( rem1 | rem2 ) != 0 );
6128     }
6129     return roundAndPackFloatx80(status->floatx80_rounding_precision,
6130                                 zSign, zExp, zSig0, zSig1, status);
6131 }
6132 
6133 /*----------------------------------------------------------------------------
6134 | Returns the remainder of the extended double-precision floating-point value
6135 | `a' with respect to the corresponding value `b'.  The operation is performed
6136 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6137 *----------------------------------------------------------------------------*/
6138 
6139 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
6140 {
6141     flag aSign, zSign;
6142     int32_t aExp, bExp, expDiff;
6143     uint64_t aSig0, aSig1, bSig;
6144     uint64_t q, term0, term1, alternateASig0, alternateASig1;
6145 
6146     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6147         float_raise(float_flag_invalid, status);
6148         return floatx80_default_nan(status);
6149     }
6150     aSig0 = extractFloatx80Frac( a );
6151     aExp = extractFloatx80Exp( a );
6152     aSign = extractFloatx80Sign( a );
6153     bSig = extractFloatx80Frac( b );
6154     bExp = extractFloatx80Exp( b );
6155     if ( aExp == 0x7FFF ) {
6156         if (    (uint64_t) ( aSig0<<1 )
6157              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
6158             return propagateFloatx80NaN(a, b, status);
6159         }
6160         goto invalid;
6161     }
6162     if ( bExp == 0x7FFF ) {
6163         if ((uint64_t)(bSig << 1)) {
6164             return propagateFloatx80NaN(a, b, status);
6165         }
6166         return a;
6167     }
6168     if ( bExp == 0 ) {
6169         if ( bSig == 0 ) {
6170  invalid:
6171             float_raise(float_flag_invalid, status);
6172             return floatx80_default_nan(status);
6173         }
6174         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
6175     }
6176     if ( aExp == 0 ) {
6177         if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
6178         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
6179     }
6180     bSig |= LIT64( 0x8000000000000000 );
6181     zSign = aSign;
6182     expDiff = aExp - bExp;
6183     aSig1 = 0;
6184     if ( expDiff < 0 ) {
6185         if ( expDiff < -1 ) return a;
6186         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
6187         expDiff = 0;
6188     }
6189     q = ( bSig <= aSig0 );
6190     if ( q ) aSig0 -= bSig;
6191     expDiff -= 64;
6192     while ( 0 < expDiff ) {
6193         q = estimateDiv128To64( aSig0, aSig1, bSig );
6194         q = ( 2 < q ) ? q - 2 : 0;
6195         mul64To128( bSig, q, &term0, &term1 );
6196         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6197         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
6198         expDiff -= 62;
6199     }
6200     expDiff += 64;
6201     if ( 0 < expDiff ) {
6202         q = estimateDiv128To64( aSig0, aSig1, bSig );
6203         q = ( 2 < q ) ? q - 2 : 0;
6204         q >>= 64 - expDiff;
6205         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
6206         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6207         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
6208         while ( le128( term0, term1, aSig0, aSig1 ) ) {
6209             ++q;
6210             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
6211         }
6212     }
6213     else {
6214         term1 = 0;
6215         term0 = bSig;
6216     }
6217     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
6218     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
6219          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
6220               && ( q & 1 ) )
6221        ) {
6222         aSig0 = alternateASig0;
6223         aSig1 = alternateASig1;
6224         zSign = ! zSign;
6225     }
6226     return
6227         normalizeRoundAndPackFloatx80(
6228             80, zSign, bExp + expDiff, aSig0, aSig1, status);
6229 
6230 }
6231 
6232 /*----------------------------------------------------------------------------
6233 | Returns the square root of the extended double-precision floating-point
6234 | value `a'.  The operation is performed according to the IEC/IEEE Standard
6235 | for Binary Floating-Point Arithmetic.
6236 *----------------------------------------------------------------------------*/
6237 
6238 floatx80 floatx80_sqrt(floatx80 a, float_status *status)
6239 {
6240     flag aSign;
6241     int32_t aExp, zExp;
6242     uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
6243     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6244 
6245     if (floatx80_invalid_encoding(a)) {
6246         float_raise(float_flag_invalid, status);
6247         return floatx80_default_nan(status);
6248     }
6249     aSig0 = extractFloatx80Frac( a );
6250     aExp = extractFloatx80Exp( a );
6251     aSign = extractFloatx80Sign( a );
6252     if ( aExp == 0x7FFF ) {
6253         if ((uint64_t)(aSig0 << 1)) {
6254             return propagateFloatx80NaN(a, a, status);
6255         }
6256         if ( ! aSign ) return a;
6257         goto invalid;
6258     }
6259     if ( aSign ) {
6260         if ( ( aExp | aSig0 ) == 0 ) return a;
6261  invalid:
6262         float_raise(float_flag_invalid, status);
6263         return floatx80_default_nan(status);
6264     }
6265     if ( aExp == 0 ) {
6266         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
6267         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
6268     }
6269     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
6270     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
6271     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
6272     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6273     doubleZSig0 = zSig0<<1;
6274     mul64To128( zSig0, zSig0, &term0, &term1 );
6275     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6276     while ( (int64_t) rem0 < 0 ) {
6277         --zSig0;
6278         doubleZSig0 -= 2;
6279         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6280     }
6281     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6282     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
6283         if ( zSig1 == 0 ) zSig1 = 1;
6284         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6285         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6286         mul64To128( zSig1, zSig1, &term2, &term3 );
6287         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6288         while ( (int64_t) rem1 < 0 ) {
6289             --zSig1;
6290             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6291             term3 |= 1;
6292             term2 |= doubleZSig0;
6293             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6294         }
6295         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6296     }
6297     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
6298     zSig0 |= doubleZSig0;
6299     return roundAndPackFloatx80(status->floatx80_rounding_precision,
6300                                 0, zExp, zSig0, zSig1, status);
6301 }
6302 
6303 /*----------------------------------------------------------------------------
6304 | Returns 1 if the extended double-precision floating-point value `a' is equal
6305 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
6306 | raised if either operand is a NaN.  Otherwise, the comparison is performed
6307 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6308 *----------------------------------------------------------------------------*/
6309 
6310 int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
6311 {
6312 
6313     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
6314         || (extractFloatx80Exp(a) == 0x7FFF
6315             && (uint64_t) (extractFloatx80Frac(a) << 1))
6316         || (extractFloatx80Exp(b) == 0x7FFF
6317             && (uint64_t) (extractFloatx80Frac(b) << 1))
6318        ) {
6319         float_raise(float_flag_invalid, status);
6320         return 0;
6321     }
6322     return
6323            ( a.low == b.low )
6324         && (    ( a.high == b.high )
6325              || (    ( a.low == 0 )
6326                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6327            );
6328 
6329 }
6330 
6331 /*----------------------------------------------------------------------------
6332 | Returns 1 if the extended double-precision floating-point value `a' is
6333 | less than or equal to the corresponding value `b', and 0 otherwise.  The
6334 | invalid exception is raised if either operand is a NaN.  The comparison is
6335 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6336 | Arithmetic.
6337 *----------------------------------------------------------------------------*/
6338 
6339 int floatx80_le(floatx80 a, floatx80 b, float_status *status)
6340 {
6341     flag aSign, bSign;
6342 
6343     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
6344         || (extractFloatx80Exp(a) == 0x7FFF
6345             && (uint64_t) (extractFloatx80Frac(a) << 1))
6346         || (extractFloatx80Exp(b) == 0x7FFF
6347             && (uint64_t) (extractFloatx80Frac(b) << 1))
6348        ) {
6349         float_raise(float_flag_invalid, status);
6350         return 0;
6351     }
6352     aSign = extractFloatx80Sign( a );
6353     bSign = extractFloatx80Sign( b );
6354     if ( aSign != bSign ) {
6355         return
6356                aSign
6357             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6358                  == 0 );
6359     }
6360     return
6361           aSign ? le128( b.high, b.low, a.high, a.low )
6362         : le128( a.high, a.low, b.high, b.low );
6363 
6364 }
6365 
6366 /*----------------------------------------------------------------------------
6367 | Returns 1 if the extended double-precision floating-point value `a' is
6368 | less than the corresponding value `b', and 0 otherwise.  The invalid
6369 | exception is raised if either operand is a NaN.  The comparison is performed
6370 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6371 *----------------------------------------------------------------------------*/
6372 
6373 int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
6374 {
6375     flag aSign, bSign;
6376 
6377     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
6378         || (extractFloatx80Exp(a) == 0x7FFF
6379             && (uint64_t) (extractFloatx80Frac(a) << 1))
6380         || (extractFloatx80Exp(b) == 0x7FFF
6381             && (uint64_t) (extractFloatx80Frac(b) << 1))
6382        ) {
6383         float_raise(float_flag_invalid, status);
6384         return 0;
6385     }
6386     aSign = extractFloatx80Sign( a );
6387     bSign = extractFloatx80Sign( b );
6388     if ( aSign != bSign ) {
6389         return
6390                aSign
6391             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6392                  != 0 );
6393     }
6394     return
6395           aSign ? lt128( b.high, b.low, a.high, a.low )
6396         : lt128( a.high, a.low, b.high, b.low );
6397 
6398 }
6399 
6400 /*----------------------------------------------------------------------------
6401 | Returns 1 if the extended double-precision floating-point values `a' and `b'
6402 | cannot be compared, and 0 otherwise.  The invalid exception is raised if
6403 | either operand is a NaN.   The comparison is performed according to the
6404 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6405 *----------------------------------------------------------------------------*/
6406 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
6407 {
6408     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
6409         || (extractFloatx80Exp(a) == 0x7FFF
6410             && (uint64_t) (extractFloatx80Frac(a) << 1))
6411         || (extractFloatx80Exp(b) == 0x7FFF
6412             && (uint64_t) (extractFloatx80Frac(b) << 1))
6413        ) {
6414         float_raise(float_flag_invalid, status);
6415         return 1;
6416     }
6417     return 0;
6418 }
6419 
6420 /*----------------------------------------------------------------------------
6421 | Returns 1 if the extended double-precision floating-point value `a' is
6422 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
6423 | cause an exception.  The comparison is performed according to the IEC/IEEE
6424 | Standard for Binary Floating-Point Arithmetic.
6425 *----------------------------------------------------------------------------*/
6426 
6427 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
6428 {
6429 
6430     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6431         float_raise(float_flag_invalid, status);
6432         return 0;
6433     }
6434     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
6435               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
6436          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
6437               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
6438        ) {
6439         if (floatx80_is_signaling_nan(a, status)
6440          || floatx80_is_signaling_nan(b, status)) {
6441             float_raise(float_flag_invalid, status);
6442         }
6443         return 0;
6444     }
6445     return
6446            ( a.low == b.low )
6447         && (    ( a.high == b.high )
6448              || (    ( a.low == 0 )
6449                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6450            );
6451 
6452 }
6453 
6454 /*----------------------------------------------------------------------------
6455 | Returns 1 if the extended double-precision floating-point value `a' is less
6456 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
6457 | do not cause an exception.  Otherwise, the comparison is performed according
6458 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6459 *----------------------------------------------------------------------------*/
6460 
6461 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
6462 {
6463     flag aSign, bSign;
6464 
6465     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6466         float_raise(float_flag_invalid, status);
6467         return 0;
6468     }
6469     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
6470               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
6471          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
6472               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
6473        ) {
6474         if (floatx80_is_signaling_nan(a, status)
6475          || floatx80_is_signaling_nan(b, status)) {
6476             float_raise(float_flag_invalid, status);
6477         }
6478         return 0;
6479     }
6480     aSign = extractFloatx80Sign( a );
6481     bSign = extractFloatx80Sign( b );
6482     if ( aSign != bSign ) {
6483         return
6484                aSign
6485             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6486                  == 0 );
6487     }
6488     return
6489           aSign ? le128( b.high, b.low, a.high, a.low )
6490         : le128( a.high, a.low, b.high, b.low );
6491 
6492 }
6493 
6494 /*----------------------------------------------------------------------------
6495 | Returns 1 if the extended double-precision floating-point value `a' is less
6496 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
6497 | an exception.  Otherwise, the comparison is performed according to the
6498 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6499 *----------------------------------------------------------------------------*/
6500 
6501 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
6502 {
6503     flag aSign, bSign;
6504 
6505     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6506         float_raise(float_flag_invalid, status);
6507         return 0;
6508     }
6509     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
6510               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
6511          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
6512               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
6513        ) {
6514         if (floatx80_is_signaling_nan(a, status)
6515          || floatx80_is_signaling_nan(b, status)) {
6516             float_raise(float_flag_invalid, status);
6517         }
6518         return 0;
6519     }
6520     aSign = extractFloatx80Sign( a );
6521     bSign = extractFloatx80Sign( b );
6522     if ( aSign != bSign ) {
6523         return
6524                aSign
6525             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6526                  != 0 );
6527     }
6528     return
6529           aSign ? lt128( b.high, b.low, a.high, a.low )
6530         : lt128( a.high, a.low, b.high, b.low );
6531 
6532 }
6533 
6534 /*----------------------------------------------------------------------------
6535 | Returns 1 if the extended double-precision floating-point values `a' and `b'
6536 | cannot be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.
6537 | The comparison is performed according to the IEC/IEEE Standard for Binary
6538 | Floating-Point Arithmetic.
6539 *----------------------------------------------------------------------------*/
6540 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
6541 {
6542     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
6543         float_raise(float_flag_invalid, status);
6544         return 1;
6545     }
6546     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
6547               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
6548          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
6549               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
6550        ) {
6551         if (floatx80_is_signaling_nan(a, status)
6552          || floatx80_is_signaling_nan(b, status)) {
6553             float_raise(float_flag_invalid, status);
6554         }
6555         return 1;
6556     }
6557     return 0;
6558 }
6559 
6560 /*----------------------------------------------------------------------------
6561 | Returns the result of converting the quadruple-precision floating-point
6562 | value `a' to the 32-bit two's complement integer format.  The conversion
6563 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6564 | Arithmetic---which means in particular that the conversion is rounded
6565 | according to the current rounding mode.  If `a' is a NaN, the largest
6566 | positive integer is returned.  Otherwise, if the conversion overflows, the
6567 | largest integer with the same sign as `a' is returned.
6568 *----------------------------------------------------------------------------*/
6569 
6570 int32_t float128_to_int32(float128 a, float_status *status)
6571 {
6572     flag aSign;
6573     int32_t aExp, shiftCount;
6574     uint64_t aSig0, aSig1;
6575 
6576     aSig1 = extractFloat128Frac1( a );
6577     aSig0 = extractFloat128Frac0( a );
6578     aExp = extractFloat128Exp( a );
6579     aSign = extractFloat128Sign( a );
6580     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
6581     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
6582     aSig0 |= ( aSig1 != 0 );
6583     shiftCount = 0x4028 - aExp;
6584     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
6585     return roundAndPackInt32(aSign, aSig0, status);
6586 
6587 }
6588 
6589 /*----------------------------------------------------------------------------
6590 | Returns the result of converting the quadruple-precision floating-point
6591 | value `a' to the 32-bit two's complement integer format.  The conversion
6592 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6593 | Arithmetic, except that the conversion is always rounded toward zero.  If
6594 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
6595 | conversion overflows, the largest integer with the same sign as `a' is
6596 | returned.
6597 *----------------------------------------------------------------------------*/
6598 
6599 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
6600 {
6601     flag aSign;
6602     int32_t aExp, shiftCount;
6603     uint64_t aSig0, aSig1, savedASig;
6604     int32_t z;
6605 
6606     aSig1 = extractFloat128Frac1( a );
6607     aSig0 = extractFloat128Frac0( a );
6608     aExp = extractFloat128Exp( a );
6609     aSign = extractFloat128Sign( a );
6610     aSig0 |= ( aSig1 != 0 );
6611     if ( 0x401E < aExp ) {
6612         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
6613         goto invalid;
6614     }
6615     else if ( aExp < 0x3FFF ) {
6616         if (aExp || aSig0) {
6617             status->float_exception_flags |= float_flag_inexact;
6618         }
6619         return 0;
6620     }
6621     aSig0 |= LIT64( 0x0001000000000000 );
6622     shiftCount = 0x402F - aExp;
6623     savedASig = aSig0;
6624     aSig0 >>= shiftCount;
6625     z = aSig0;
6626     if ( aSign ) z = - z;
6627     if ( ( z < 0 ) ^ aSign ) {
6628  invalid:
6629         float_raise(float_flag_invalid, status);
6630         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
6631     }
6632     if ( ( aSig0<<shiftCount ) != savedASig ) {
6633         status->float_exception_flags |= float_flag_inexact;
6634     }
6635     return z;
6636 
6637 }
6638 
6639 /*----------------------------------------------------------------------------
6640 | Returns the result of converting the quadruple-precision floating-point
6641 | value `a' to the 64-bit two's complement integer format.  The conversion
6642 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6643 | Arithmetic---which means in particular that the conversion is rounded
6644 | according to the current rounding mode.  If `a' is a NaN, the largest
6645 | positive integer is returned.  Otherwise, if the conversion overflows, the
6646 | largest integer with the same sign as `a' is returned.
6647 *----------------------------------------------------------------------------*/
6648 
6649 int64_t float128_to_int64(float128 a, float_status *status)
6650 {
6651     flag aSign;
6652     int32_t aExp, shiftCount;
6653     uint64_t aSig0, aSig1;
6654 
6655     aSig1 = extractFloat128Frac1( a );
6656     aSig0 = extractFloat128Frac0( a );
6657     aExp = extractFloat128Exp( a );
6658     aSign = extractFloat128Sign( a );
6659     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
6660     shiftCount = 0x402F - aExp;
6661     if ( shiftCount <= 0 ) {
6662         if ( 0x403E < aExp ) {
6663             float_raise(float_flag_invalid, status);
6664             if (    ! aSign
6665                  || (    ( aExp == 0x7FFF )
6666                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
6667                     )
6668                ) {
6669                 return LIT64( 0x7FFFFFFFFFFFFFFF );
6670             }
6671             return (int64_t) LIT64( 0x8000000000000000 );
6672         }
6673         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
6674     }
6675     else {
6676         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
6677     }
6678     return roundAndPackInt64(aSign, aSig0, aSig1, status);
6679 
6680 }
6681 
/*----------------------------------------------------------------------------
| Returns the result of converting the quadruple-precision floating-point
| value `a' to the 64-bit two's complement integer format.  The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic, except that the conversion is always rounded toward zero.
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
| the conversion overflows, the largest integer with the same sign as `a' is
| returned.  NaN and overflow raise the invalid flag; discarding non-zero
| fraction bits sets the inexact flag.
*----------------------------------------------------------------------------*/

int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
{
    flag aSign;
    int32_t aExp, shiftCount;
    uint64_t aSig0, aSig1;
    int64_t z;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
    /* Positive: integer bits extend into aSig1; otherwise shift aSig0 right. */
    shiftCount = aExp - 0x402F;
    if ( 0 < shiftCount ) {
        if ( 0x403E <= aExp ) {
            /* Strip the implicit bit again so aSig0 is the bare fraction. */
            aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
            /*
             * The only input in this range that does not raise invalid has
             * a high word of exactly 0xC03E000000000000 (sign set, exponent
             * 0x403E, zero high fraction) and aSig1 below 2^49; it yields
             * the most negative integer, inexact if aSig1 is non-zero.
             */
            if (    ( a.high == LIT64( 0xC03E000000000000 ) )
                 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
                if (aSig1) {
                    status->float_exception_flags |= float_flag_inexact;
                }
            }
            else {
                float_raise(float_flag_invalid, status);
                /* Positive overflow and NaNs saturate to the maximum. */
                if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
                    return LIT64( 0x7FFFFFFFFFFFFFFF );
                }
            }
            return (int64_t) LIT64( 0x8000000000000000 );
        }
        /* shiftCount is 1..14: take the top bits of aSig1 into the result. */
        z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
        /* Bits of aSig1 left below the binary point were discarded. */
        if ( (uint64_t) ( aSig1<<shiftCount ) ) {
            status->float_exception_flags |= float_flag_inexact;
        }
    }
    else {
        if ( aExp < 0x3FFF ) {
            /* Magnitude below 1: result is 0, inexact unless `a' is a zero. */
            if ( aExp | aSig0 | aSig1 ) {
                status->float_exception_flags |= float_flag_inexact;
            }
            return 0;
        }
        z = aSig0>>( - shiftCount );
        /*
         * Inexact if the low word is non-zero or aSig0 lost bits in the
         * shift; the shiftCount test skips the check when nothing was
         * shifted out of aSig0.
         */
        if (    aSig1
             || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
            status->float_exception_flags |= float_flag_inexact;
        }
    }
    if ( aSign ) z = - z;
    return z;

}
6744 
6745 /*----------------------------------------------------------------------------
6746 | Returns the result of converting the quadruple-precision floating-point value
6747 | `a' to the 64-bit unsigned integer format.  The conversion is
6748 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6749 | Arithmetic---which means in particular that the conversion is rounded
6750 | according to the current rounding mode.  If `a' is a NaN, the largest
6751 | positive integer is returned.  If the conversion overflows, the
6752 | largest unsigned integer is returned.  If 'a' is negative, the value is
6753 | rounded and zero is returned; negative values that do not round to zero
6754 | will raise the inexact exception.
6755 *----------------------------------------------------------------------------*/
6756 
6757 uint64_t float128_to_uint64(float128 a, float_status *status)
6758 {
6759     flag aSign;
6760     int aExp;
6761     int shiftCount;
6762     uint64_t aSig0, aSig1;
6763 
6764     aSig0 = extractFloat128Frac0(a);
6765     aSig1 = extractFloat128Frac1(a);
6766     aExp = extractFloat128Exp(a);
6767     aSign = extractFloat128Sign(a);
6768     if (aSign && (aExp > 0x3FFE)) {
6769         float_raise(float_flag_invalid, status);
6770         if (float128_is_any_nan(a)) {
6771             return LIT64(0xFFFFFFFFFFFFFFFF);
6772         } else {
6773             return 0;
6774         }
6775     }
6776     if (aExp) {
6777         aSig0 |= LIT64(0x0001000000000000);
6778     }
6779     shiftCount = 0x402F - aExp;
6780     if (shiftCount <= 0) {
6781         if (0x403E < aExp) {
6782             float_raise(float_flag_invalid, status);
6783             return LIT64(0xFFFFFFFFFFFFFFFF);
6784         }
6785         shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
6786     } else {
6787         shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
6788     }
6789     return roundAndPackUint64(aSign, aSig0, aSig1, status);
6790 }
6791 
6792 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
6793 {
6794     uint64_t v;
6795     signed char current_rounding_mode = status->float_rounding_mode;
6796 
6797     set_float_rounding_mode(float_round_to_zero, status);
6798     v = float128_to_uint64(a, status);
6799     set_float_rounding_mode(current_rounding_mode, status);
6800 
6801     return v;
6802 }
6803 
6804 /*----------------------------------------------------------------------------
6805 | Returns the result of converting the quadruple-precision floating-point
6806 | value `a' to the 32-bit unsigned integer format.  The conversion
6807 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6808 | Arithmetic except that the conversion is always rounded toward zero.
6809 | If `a' is a NaN, the largest positive integer is returned.  Otherwise,
6810 | if the conversion overflows, the largest unsigned integer is returned.
6811 | If 'a' is negative, the value is rounded and zero is returned; negative
6812 | values that do not round to zero will raise the inexact exception.
6813 *----------------------------------------------------------------------------*/
6814 
6815 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
6816 {
6817     uint64_t v;
6818     uint32_t res;
6819     int old_exc_flags = get_float_exception_flags(status);
6820 
6821     v = float128_to_uint64_round_to_zero(a, status);
6822     if (v > 0xffffffff) {
6823         res = 0xffffffff;
6824     } else {
6825         return v;
6826     }
6827     set_float_exception_flags(old_exc_flags, status);
6828     float_raise(float_flag_invalid, status);
6829     return res;
6830 }
6831 
6832 /*----------------------------------------------------------------------------
6833 | Returns the result of converting the quadruple-precision floating-point value
6834 | `a' to the 32-bit unsigned integer format.  The conversion is
6835 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6836 | Arithmetic---which means in particular that the conversion is rounded
6837 | according to the current rounding mode.  If `a' is a NaN, the largest
6838 | positive integer is returned.  If the conversion overflows, the
6839 | largest unsigned integer is returned.  If 'a' is negative, the value is
6840 | rounded and zero is returned; negative values that do not round to zero
6841 | will raise the inexact exception.
6842 *----------------------------------------------------------------------------*/
6843 
6844 uint32_t float128_to_uint32(float128 a, float_status *status)
6845 {
6846     uint64_t v;
6847     uint32_t res;
6848     int old_exc_flags = get_float_exception_flags(status);
6849 
6850     v = float128_to_uint64(a, status);
6851     if (v > 0xffffffff) {
6852         res = 0xffffffff;
6853     } else {
6854         return v;
6855     }
6856     set_float_exception_flags(old_exc_flags, status);
6857     float_raise(float_flag_invalid, status);
6858     return res;
6859 }
6860 
6861 /*----------------------------------------------------------------------------
6862 | Returns the result of converting the quadruple-precision floating-point
6863 | value `a' to the single-precision floating-point format.  The conversion
6864 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6865 | Arithmetic.
6866 *----------------------------------------------------------------------------*/
6867 
6868 float32 float128_to_float32(float128 a, float_status *status)
6869 {
6870     flag aSign;
6871     int32_t aExp;
6872     uint64_t aSig0, aSig1;
6873     uint32_t zSig;
6874 
6875     aSig1 = extractFloat128Frac1( a );
6876     aSig0 = extractFloat128Frac0( a );
6877     aExp = extractFloat128Exp( a );
6878     aSign = extractFloat128Sign( a );
6879     if ( aExp == 0x7FFF ) {
6880         if ( aSig0 | aSig1 ) {
6881             return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
6882         }
6883         return packFloat32( aSign, 0xFF, 0 );
6884     }
6885     aSig0 |= ( aSig1 != 0 );
6886     shift64RightJamming( aSig0, 18, &aSig0 );
6887     zSig = aSig0;
6888     if ( aExp || zSig ) {
6889         zSig |= 0x40000000;
6890         aExp -= 0x3F81;
6891     }
6892     return roundAndPackFloat32(aSign, aExp, zSig, status);
6893 
6894 }
6895 
6896 /*----------------------------------------------------------------------------
6897 | Returns the result of converting the quadruple-precision floating-point
6898 | value `a' to the double-precision floating-point format.  The conversion
6899 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6900 | Arithmetic.
6901 *----------------------------------------------------------------------------*/
6902 
6903 float64 float128_to_float64(float128 a, float_status *status)
6904 {
6905     flag aSign;
6906     int32_t aExp;
6907     uint64_t aSig0, aSig1;
6908 
6909     aSig1 = extractFloat128Frac1( a );
6910     aSig0 = extractFloat128Frac0( a );
6911     aExp = extractFloat128Exp( a );
6912     aSign = extractFloat128Sign( a );
6913     if ( aExp == 0x7FFF ) {
6914         if ( aSig0 | aSig1 ) {
6915             return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
6916         }
6917         return packFloat64( aSign, 0x7FF, 0 );
6918     }
6919     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6920     aSig0 |= ( aSig1 != 0 );
6921     if ( aExp || aSig0 ) {
6922         aSig0 |= LIT64( 0x4000000000000000 );
6923         aExp -= 0x3C01;
6924     }
6925     return roundAndPackFloat64(aSign, aExp, aSig0, status);
6926 
6927 }
6928 
6929 /*----------------------------------------------------------------------------
6930 | Returns the result of converting the quadruple-precision floating-point
6931 | value `a' to the extended double-precision floating-point format.  The
6932 | conversion is performed according to the IEC/IEEE Standard for Binary
6933 | Floating-Point Arithmetic.
6934 *----------------------------------------------------------------------------*/
6935 
6936 floatx80 float128_to_floatx80(float128 a, float_status *status)
6937 {
6938     flag aSign;
6939     int32_t aExp;
6940     uint64_t aSig0, aSig1;
6941 
6942     aSig1 = extractFloat128Frac1( a );
6943     aSig0 = extractFloat128Frac0( a );
6944     aExp = extractFloat128Exp( a );
6945     aSign = extractFloat128Sign( a );
6946     if ( aExp == 0x7FFF ) {
6947         if ( aSig0 | aSig1 ) {
6948             return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
6949         }
6950         return packFloatx80(aSign, floatx80_infinity_high,
6951                                    floatx80_infinity_low);
6952     }
6953     if ( aExp == 0 ) {
6954         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
6955         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6956     }
6957     else {
6958         aSig0 |= LIT64( 0x0001000000000000 );
6959     }
6960     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
6961     return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
6962 
6963 }
6964 
6965 /*----------------------------------------------------------------------------
6966 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6967 | returns the result as a quadruple-precision floating-point value.  The
6968 | operation is performed according to the IEC/IEEE Standard for Binary
6969 | Floating-Point Arithmetic.
6970 *----------------------------------------------------------------------------*/
6971 
6972 float128 float128_round_to_int(float128 a, float_status *status)
6973 {
6974     flag aSign;
6975     int32_t aExp;
6976     uint64_t lastBitMask, roundBitsMask;
6977     float128 z;
6978 
6979     aExp = extractFloat128Exp( a );
6980     if ( 0x402F <= aExp ) {
6981         if ( 0x406F <= aExp ) {
6982             if (    ( aExp == 0x7FFF )
6983                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
6984                ) {
6985                 return propagateFloat128NaN(a, a, status);
6986             }
6987             return a;
6988         }
6989         lastBitMask = 1;
6990         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
6991         roundBitsMask = lastBitMask - 1;
6992         z = a;
6993         switch (status->float_rounding_mode) {
6994         case float_round_nearest_even:
6995             if ( lastBitMask ) {
6996                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
6997                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
6998             }
6999             else {
7000                 if ( (int64_t) z.low < 0 ) {
7001                     ++z.high;
7002                     if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
7003                 }
7004             }
7005             break;
7006         case float_round_ties_away:
7007             if (lastBitMask) {
7008                 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
7009             } else {
7010                 if ((int64_t) z.low < 0) {
7011                     ++z.high;
7012                 }
7013             }
7014             break;
7015         case float_round_to_zero:
7016             break;
7017         case float_round_up:
7018             if (!extractFloat128Sign(z)) {
7019                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
7020             }
7021             break;
7022         case float_round_down:
7023             if (extractFloat128Sign(z)) {
7024                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
7025             }
7026             break;
7027         case float_round_to_odd:
7028             /*
7029              * Note that if lastBitMask == 0, the last bit is the lsb
7030              * of high, and roundBitsMask == -1.
7031              */
7032             if ((lastBitMask ? z.low & lastBitMask : z.high & 1) == 0) {
7033                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
7034             }
7035             break;
7036         default:
7037             abort();
7038         }
7039         z.low &= ~ roundBitsMask;
7040     }
7041     else {
7042         if ( aExp < 0x3FFF ) {
7043             if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
7044             status->float_exception_flags |= float_flag_inexact;
7045             aSign = extractFloat128Sign( a );
7046             switch (status->float_rounding_mode) {
7047             case float_round_nearest_even:
7048                 if (    ( aExp == 0x3FFE )
7049                      && (   extractFloat128Frac0( a )
7050                           | extractFloat128Frac1( a ) )
7051                    ) {
7052                     return packFloat128( aSign, 0x3FFF, 0, 0 );
7053                 }
7054                 break;
7055             case float_round_ties_away:
7056                 if (aExp == 0x3FFE) {
7057                     return packFloat128(aSign, 0x3FFF, 0, 0);
7058                 }
7059                 break;
7060             case float_round_down:
7061                 return
7062                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
7063                     : packFloat128( 0, 0, 0, 0 );
7064             case float_round_up:
7065                 return
7066                       aSign ? packFloat128( 1, 0, 0, 0 )
7067                     : packFloat128( 0, 0x3FFF, 0, 0 );
7068 
7069             case float_round_to_odd:
7070                 return packFloat128(aSign, 0x3FFF, 0, 0);
7071             }
7072             return packFloat128( aSign, 0, 0, 0 );
7073         }
7074         lastBitMask = 1;
7075         lastBitMask <<= 0x402F - aExp;
7076         roundBitsMask = lastBitMask - 1;
7077         z.low = 0;
7078         z.high = a.high;
7079         switch (status->float_rounding_mode) {
7080         case float_round_nearest_even:
7081             z.high += lastBitMask>>1;
7082             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
7083                 z.high &= ~ lastBitMask;
7084             }
7085             break;
7086         case float_round_ties_away:
7087             z.high += lastBitMask>>1;
7088             break;
7089         case float_round_to_zero:
7090             break;
7091         case float_round_up:
7092             if (!extractFloat128Sign(z)) {
7093                 z.high |= ( a.low != 0 );
7094                 z.high += roundBitsMask;
7095             }
7096             break;
7097         case float_round_down:
7098             if (extractFloat128Sign(z)) {
7099                 z.high |= (a.low != 0);
7100                 z.high += roundBitsMask;
7101             }
7102             break;
7103         case float_round_to_odd:
7104             if ((z.high & lastBitMask) == 0) {
7105                 z.high |= (a.low != 0);
7106                 z.high += roundBitsMask;
7107             }
7108             break;
7109         default:
7110             abort();
7111         }
7112         z.high &= ~ roundBitsMask;
7113     }
7114     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
7115         status->float_exception_flags |= float_flag_inexact;
7116     }
7117     return z;
7118 
7119 }
7120 
/*----------------------------------------------------------------------------
| Returns the result of adding the absolute values of the quadruple-precision
| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
| before being returned.  `zSign' is ignored if the result is a NaN.
| The addition is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.  The operand with the smaller exponent is
| shifted right to align it, with the shifted-out bits collected in `zSig2'
| for rounding by roundAndPackFloat128.
*----------------------------------------------------------------------------*/

static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
    int32_t expDiff;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    expDiff = aExp - bExp;
    if ( 0 < expDiff ) {
        /* `a' has the larger exponent: align `b' to it. */
        if ( aExp == 0x7FFF ) {
            /* NaN propagates; infinity plus a finite value is `a' itself. */
            if (aSig0 | aSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            return a;
        }
        if ( bExp == 0 ) {
            /* Exponent field 0: no implicit bit, and one less shift. */
            --expDiff;
        }
        else {
            bSig0 |= LIT64( 0x0001000000000000 );
        }
        shift128ExtraRightJamming(
            bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
        zExp = aExp;
    }
    else if ( expDiff < 0 ) {
        /* `b' has the larger exponent: align `a' to it. */
        if ( bExp == 0x7FFF ) {
            if (bSig0 | bSig1) {
                return propagateFloat128NaN(a, b, status);
            }
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        if ( aExp == 0 ) {
            ++expDiff;
        }
        else {
            aSig0 |= LIT64( 0x0001000000000000 );
        }
        shift128ExtraRightJamming(
            aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
        zExp = bExp;
    }
    else {
        /* Equal exponents: no alignment shift is needed. */
        if ( aExp == 0x7FFF ) {
            if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
                return propagateFloat128NaN(a, b, status);
            }
            return a;
        }
        add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
        if ( aExp == 0 ) {
            /*
             * Both exponent fields are 0: the raw sum is packed directly
             * with exponent field 0, or flushed to a signed zero (raising
             * output_denormal if it was non-zero) when flush_to_zero is set.
             */
            if (status->flush_to_zero) {
                if (zSig0 | zSig1) {
                    float_raise(float_flag_output_denormal, status);
                }
                return packFloat128(zSign, 0, 0, 0);
            }
            return packFloat128( zSign, 0, zSig0, zSig1 );
        }
        zSig2 = 0;
        /* Account for both implicit leading bits at once. */
        zSig0 |= LIT64( 0x0002000000000000 );
        zExp = aExp;
        goto shiftRight1;
    }
    /*
     * Unequal exponents: this single OR supplies the implicit bit of the
     * larger operand to the sum in both branches above.
     */
    aSig0 |= LIT64( 0x0001000000000000 );
    add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
    --zExp;
    /* No carry out of the leading bit: round as is with the lowered zExp. */
    if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
    ++zExp;
 shiftRight1:
    shift128ExtraRightJamming(
        zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
 roundAndPack:
    return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);

}
7211 
/*----------------------------------------------------------------------------
| Returns the result of subtracting the absolute values of the quadruple-
| precision floating-point values `a' and `b'.  If `zSign' is 1, the
| difference is negated before being returned.  `zSign' is ignored if the
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic.  Both significands are
| pre-shifted left by 14 bits; the final exponent is reduced by 14 to
| compensate before normalizeRoundAndPackFloat128 is called.
*----------------------------------------------------------------------------*/

static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
                                float_status *status)
{
    int32_t aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
    int32_t expDiff;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    expDiff = aExp - bExp;
    /* After this shift the implicit bit position is 0x4000000000000000. */
    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
    shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
    if ( 0 < expDiff ) goto aExpBigger;
    if ( expDiff < 0 ) goto bExpBigger;
    /* Equal exponents. */
    if ( aExp == 0x7FFF ) {
        if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
            return propagateFloat128NaN(a, b, status);
        }
        /* Infinity minus infinity: invalid, result is the default NaN. */
        float_raise(float_flag_invalid, status);
        return float128_default_nan(status);
    }
    if ( aExp == 0 ) {
        /* Both exponent fields are 0: use exponent 1 for the result. */
        aExp = 1;
        bExp = 1;
    }
    /* Compare the significands to decide which operand is larger. */
    if ( bSig0 < aSig0 ) goto aBigger;
    if ( aSig0 < bSig0 ) goto bBigger;
    if ( bSig1 < aSig1 ) goto aBigger;
    if ( aSig1 < bSig1 ) goto bBigger;
    /* Exact zero: the sign is negative only in round-down mode. */
    return packFloat128(status->float_rounding_mode == float_round_down,
                        0, 0, 0);
 bExpBigger:
    if ( bExp == 0x7FFF ) {
        if (bSig0 | bSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        /* Finite minus infinity: infinity with the opposite sign. */
        return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
    }
    if ( aExp == 0 ) {
        /* Exponent field 0: no implicit bit, and one less shift. */
        ++expDiff;
    }
    else {
        aSig0 |= LIT64( 0x4000000000000000 );
    }
    shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
    bSig0 |= LIT64( 0x4000000000000000 );
 bBigger:
    /* |b| > |a|: compute b - a and flip the result sign. */
    sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
    zExp = bExp;
    zSign ^= 1;
    goto normalizeRoundAndPack;
 aExpBigger:
    if ( aExp == 0x7FFF ) {
        if (aSig0 | aSig1) {
            return propagateFloat128NaN(a, b, status);
        }
        return a;
    }
    if ( bExp == 0 ) {
        --expDiff;
    }
    else {
        bSig0 |= LIT64( 0x4000000000000000 );
    }
    shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
    aSig0 |= LIT64( 0x4000000000000000 );
 aBigger:
    sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
    zExp = aExp;
 normalizeRoundAndPack:
    --zExp;
    /* Undo the 14-bit pre-shift in the exponent. */
    return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
                                         status);

}
7299 
7300 /*----------------------------------------------------------------------------
7301 | Returns the result of adding the quadruple-precision floating-point values
7302 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
7303 | for Binary Floating-Point Arithmetic.
7304 *----------------------------------------------------------------------------*/
7305 
7306 float128 float128_add(float128 a, float128 b, float_status *status)
7307 {
7308     flag aSign, bSign;
7309 
7310     aSign = extractFloat128Sign( a );
7311     bSign = extractFloat128Sign( b );
7312     if ( aSign == bSign ) {
7313         return addFloat128Sigs(a, b, aSign, status);
7314     }
7315     else {
7316         return subFloat128Sigs(a, b, aSign, status);
7317     }
7318 
7319 }
7320 
7321 /*----------------------------------------------------------------------------
7322 | Returns the result of subtracting the quadruple-precision floating-point
7323 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
7324 | Standard for Binary Floating-Point Arithmetic.
7325 *----------------------------------------------------------------------------*/
7326 
7327 float128 float128_sub(float128 a, float128 b, float_status *status)
7328 {
7329     flag aSign, bSign;
7330 
7331     aSign = extractFloat128Sign( a );
7332     bSign = extractFloat128Sign( b );
7333     if ( aSign == bSign ) {
7334         return subFloat128Sigs(a, b, aSign, status);
7335     }
7336     else {
7337         return addFloat128Sigs(a, b, aSign, status);
7338     }
7339 
7340 }
7341 
7342 /*----------------------------------------------------------------------------
7343 | Returns the result of multiplying the quadruple-precision floating-point
7344 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
7345 | Standard for Binary Floating-Point Arithmetic.
7346 *----------------------------------------------------------------------------*/
7347 
7348 float128 float128_mul(float128 a, float128 b, float_status *status)
7349 {
7350     flag aSign, bSign, zSign;
7351     int32_t aExp, bExp, zExp;
7352     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
7353 
7354     aSig1 = extractFloat128Frac1( a );
7355     aSig0 = extractFloat128Frac0( a );
7356     aExp = extractFloat128Exp( a );
7357     aSign = extractFloat128Sign( a );
7358     bSig1 = extractFloat128Frac1( b );
7359     bSig0 = extractFloat128Frac0( b );
7360     bExp = extractFloat128Exp( b );
7361     bSign = extractFloat128Sign( b );
7362     zSign = aSign ^ bSign;
7363     if ( aExp == 0x7FFF ) {
7364         if (    ( aSig0 | aSig1 )
7365              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
7366             return propagateFloat128NaN(a, b, status);
7367         }
7368         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
7369         return packFloat128( zSign, 0x7FFF, 0, 0 );
7370     }
7371     if ( bExp == 0x7FFF ) {
7372         if (bSig0 | bSig1) {
7373             return propagateFloat128NaN(a, b, status);
7374         }
7375         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
7376  invalid:
7377             float_raise(float_flag_invalid, status);
7378             return float128_default_nan(status);
7379         }
7380         return packFloat128( zSign, 0x7FFF, 0, 0 );
7381     }
7382     if ( aExp == 0 ) {
7383         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
7384         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7385     }
7386     if ( bExp == 0 ) {
7387         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
7388         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
7389     }
7390     zExp = aExp + bExp - 0x4000;
7391     aSig0 |= LIT64( 0x0001000000000000 );
7392     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
7393     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
7394     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
7395     zSig2 |= ( zSig3 != 0 );
7396     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
7397         shift128ExtraRightJamming(
7398             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
7399         ++zExp;
7400     }
7401     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
7402 
7403 }
7404 
7405 /*----------------------------------------------------------------------------
7406 | Returns the result of dividing the quadruple-precision floating-point value
7407 | `a' by the corresponding value `b'.  The operation is performed according to
7408 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7409 *----------------------------------------------------------------------------*/
7410 
7411 float128 float128_div(float128 a, float128 b, float_status *status)
7412 {
7413     flag aSign, bSign, zSign;
7414     int32_t aExp, bExp, zExp;
7415     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
7416     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
7417 
7418     aSig1 = extractFloat128Frac1( a );
7419     aSig0 = extractFloat128Frac0( a );
7420     aExp = extractFloat128Exp( a );
7421     aSign = extractFloat128Sign( a );
7422     bSig1 = extractFloat128Frac1( b );
7423     bSig0 = extractFloat128Frac0( b );
7424     bExp = extractFloat128Exp( b );
7425     bSign = extractFloat128Sign( b );
7426     zSign = aSign ^ bSign;
7427     if ( aExp == 0x7FFF ) {
7428         if (aSig0 | aSig1) {
7429             return propagateFloat128NaN(a, b, status);
7430         }
7431         if ( bExp == 0x7FFF ) {
7432             if (bSig0 | bSig1) {
7433                 return propagateFloat128NaN(a, b, status);
7434             }
7435             goto invalid;
7436         }
7437         return packFloat128( zSign, 0x7FFF, 0, 0 );
7438     }
7439     if ( bExp == 0x7FFF ) {
7440         if (bSig0 | bSig1) {
7441             return propagateFloat128NaN(a, b, status);
7442         }
7443         return packFloat128( zSign, 0, 0, 0 );
7444     }
7445     if ( bExp == 0 ) {
7446         if ( ( bSig0 | bSig1 ) == 0 ) {
7447             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
7448  invalid:
7449                 float_raise(float_flag_invalid, status);
7450                 return float128_default_nan(status);
7451             }
7452             float_raise(float_flag_divbyzero, status);
7453             return packFloat128( zSign, 0x7FFF, 0, 0 );
7454         }
7455         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
7456     }
7457     if ( aExp == 0 ) {
7458         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
7459         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7460     }
7461     zExp = aExp - bExp + 0x3FFD;
7462     shortShift128Left(
7463         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
7464     shortShift128Left(
7465         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
7466     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
7467         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
7468         ++zExp;
7469     }
7470     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
7471     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
7472     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
7473     while ( (int64_t) rem0 < 0 ) {
7474         --zSig0;
7475         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
7476     }
7477     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
7478     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
7479         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
7480         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
7481         while ( (int64_t) rem1 < 0 ) {
7482             --zSig1;
7483             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
7484         }
7485         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
7486     }
7487     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
7488     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
7489 
7490 }
7491 
7492 /*----------------------------------------------------------------------------
7493 | Returns the remainder of the quadruple-precision floating-point value `a'
7494 | with respect to the corresponding value `b'.  The operation is performed
7495 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7496 *----------------------------------------------------------------------------*/
7497 
7498 float128 float128_rem(float128 a, float128 b, float_status *status)
7499 {
7500     flag aSign, zSign;
7501     int32_t aExp, bExp, expDiff;
7502     uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
7503     uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
7504     int64_t sigMean0;
7505 
7506     aSig1 = extractFloat128Frac1( a );
7507     aSig0 = extractFloat128Frac0( a );
7508     aExp = extractFloat128Exp( a );
7509     aSign = extractFloat128Sign( a );
7510     bSig1 = extractFloat128Frac1( b );
7511     bSig0 = extractFloat128Frac0( b );
7512     bExp = extractFloat128Exp( b );
7513     if ( aExp == 0x7FFF ) {
7514         if (    ( aSig0 | aSig1 )
7515              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
7516             return propagateFloat128NaN(a, b, status);
7517         }
7518         goto invalid;
7519     }
7520     if ( bExp == 0x7FFF ) {
7521         if (bSig0 | bSig1) {
7522             return propagateFloat128NaN(a, b, status);
7523         }
7524         return a;
7525     }
7526     if ( bExp == 0 ) {
7527         if ( ( bSig0 | bSig1 ) == 0 ) {
7528  invalid:
7529             float_raise(float_flag_invalid, status);
7530             return float128_default_nan(status);
7531         }
7532         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
7533     }
7534     if ( aExp == 0 ) {
7535         if ( ( aSig0 | aSig1 ) == 0 ) return a;
7536         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7537     }
7538     expDiff = aExp - bExp;
7539     if ( expDiff < -1 ) return a;
7540     shortShift128Left(
7541         aSig0 | LIT64( 0x0001000000000000 ),
7542         aSig1,
7543         15 - ( expDiff < 0 ),
7544         &aSig0,
7545         &aSig1
7546     );
7547     shortShift128Left(
7548         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
7549     q = le128( bSig0, bSig1, aSig0, aSig1 );
7550     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
7551     expDiff -= 64;
7552     while ( 0 < expDiff ) {
7553         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
7554         q = ( 4 < q ) ? q - 4 : 0;
7555         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
7556         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
7557         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
7558         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
7559         expDiff -= 61;
7560     }
7561     if ( -64 < expDiff ) {
7562         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
7563         q = ( 4 < q ) ? q - 4 : 0;
7564         q >>= - expDiff;
7565         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
7566         expDiff += 52;
7567         if ( expDiff < 0 ) {
7568             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
7569         }
7570         else {
7571             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
7572         }
7573         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
7574         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
7575     }
7576     else {
7577         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
7578         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
7579     }
7580     do {
7581         alternateASig0 = aSig0;
7582         alternateASig1 = aSig1;
7583         ++q;
7584         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
7585     } while ( 0 <= (int64_t) aSig0 );
7586     add128(
7587         aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
7588     if (    ( sigMean0 < 0 )
7589          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
7590         aSig0 = alternateASig0;
7591         aSig1 = alternateASig1;
7592     }
7593     zSign = ( (int64_t) aSig0 < 0 );
7594     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
7595     return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
7596                                          status);
7597 }
7598 
7599 /*----------------------------------------------------------------------------
7600 | Returns the square root of the quadruple-precision floating-point value `a'.
7601 | The operation is performed according to the IEC/IEEE Standard for Binary
7602 | Floating-Point Arithmetic.
7603 *----------------------------------------------------------------------------*/
7604 
7605 float128 float128_sqrt(float128 a, float_status *status)
7606 {
7607     flag aSign;
7608     int32_t aExp, zExp;
7609     uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
7610     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
7611 
7612     aSig1 = extractFloat128Frac1( a );
7613     aSig0 = extractFloat128Frac0( a );
7614     aExp = extractFloat128Exp( a );
7615     aSign = extractFloat128Sign( a );
7616     if ( aExp == 0x7FFF ) {
7617         if (aSig0 | aSig1) {
7618             return propagateFloat128NaN(a, a, status);
7619         }
7620         if ( ! aSign ) return a;
7621         goto invalid;
7622     }
7623     if ( aSign ) {
7624         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
7625  invalid:
7626         float_raise(float_flag_invalid, status);
7627         return float128_default_nan(status);
7628     }
7629     if ( aExp == 0 ) {
7630         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
7631         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
7632     }
7633     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
7634     aSig0 |= LIT64( 0x0001000000000000 );
7635     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
7636     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
7637     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
7638     doubleZSig0 = zSig0<<1;
7639     mul64To128( zSig0, zSig0, &term0, &term1 );
7640     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
7641     while ( (int64_t) rem0 < 0 ) {
7642         --zSig0;
7643         doubleZSig0 -= 2;
7644         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
7645     }
7646     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
7647     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
7648         if ( zSig1 == 0 ) zSig1 = 1;
7649         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
7650         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
7651         mul64To128( zSig1, zSig1, &term2, &term3 );
7652         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
7653         while ( (int64_t) rem1 < 0 ) {
7654             --zSig1;
7655             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
7656             term3 |= 1;
7657             term2 |= doubleZSig0;
7658             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
7659         }
7660         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
7661     }
7662     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
7663     return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
7664 
7665 }
7666 
7667 /*----------------------------------------------------------------------------
7668 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
7669 | the corresponding value `b', and 0 otherwise.  The invalid exception is
7670 | raised if either operand is a NaN.  Otherwise, the comparison is performed
7671 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7672 *----------------------------------------------------------------------------*/
7673 
7674 int float128_eq(float128 a, float128 b, float_status *status)
7675 {
7676 
7677     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7678               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7679          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7680               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7681        ) {
7682         float_raise(float_flag_invalid, status);
7683         return 0;
7684     }
7685     return
7686            ( a.low == b.low )
7687         && (    ( a.high == b.high )
7688              || (    ( a.low == 0 )
7689                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
7690            );
7691 
7692 }
7693 
7694 /*----------------------------------------------------------------------------
7695 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7696 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
7697 | exception is raised if either operand is a NaN.  The comparison is performed
7698 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7699 *----------------------------------------------------------------------------*/
7700 
7701 int float128_le(float128 a, float128 b, float_status *status)
7702 {
7703     flag aSign, bSign;
7704 
7705     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7706               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7707          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7708               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7709        ) {
7710         float_raise(float_flag_invalid, status);
7711         return 0;
7712     }
7713     aSign = extractFloat128Sign( a );
7714     bSign = extractFloat128Sign( b );
7715     if ( aSign != bSign ) {
7716         return
7717                aSign
7718             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7719                  == 0 );
7720     }
7721     return
7722           aSign ? le128( b.high, b.low, a.high, a.low )
7723         : le128( a.high, a.low, b.high, b.low );
7724 
7725 }
7726 
7727 /*----------------------------------------------------------------------------
7728 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7729 | the corresponding value `b', and 0 otherwise.  The invalid exception is
7730 | raised if either operand is a NaN.  The comparison is performed according
7731 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7732 *----------------------------------------------------------------------------*/
7733 
7734 int float128_lt(float128 a, float128 b, float_status *status)
7735 {
7736     flag aSign, bSign;
7737 
7738     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7739               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7740          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7741               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7742        ) {
7743         float_raise(float_flag_invalid, status);
7744         return 0;
7745     }
7746     aSign = extractFloat128Sign( a );
7747     bSign = extractFloat128Sign( b );
7748     if ( aSign != bSign ) {
7749         return
7750                aSign
7751             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7752                  != 0 );
7753     }
7754     return
7755           aSign ? lt128( b.high, b.low, a.high, a.low )
7756         : lt128( a.high, a.low, b.high, b.low );
7757 
7758 }
7759 
7760 /*----------------------------------------------------------------------------
7761 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7762 | be compared, and 0 otherwise.  The invalid exception is raised if either
7763 | operand is a NaN. The comparison is performed according to the IEC/IEEE
7764 | Standard for Binary Floating-Point Arithmetic.
7765 *----------------------------------------------------------------------------*/
7766 
7767 int float128_unordered(float128 a, float128 b, float_status *status)
7768 {
7769     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7770               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7771          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7772               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7773        ) {
7774         float_raise(float_flag_invalid, status);
7775         return 1;
7776     }
7777     return 0;
7778 }
7779 
7780 /*----------------------------------------------------------------------------
7781 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
7782 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
7783 | exception.  The comparison is performed according to the IEC/IEEE Standard
7784 | for Binary Floating-Point Arithmetic.
7785 *----------------------------------------------------------------------------*/
7786 
7787 int float128_eq_quiet(float128 a, float128 b, float_status *status)
7788 {
7789 
7790     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7791               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7792          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7793               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7794        ) {
7795         if (float128_is_signaling_nan(a, status)
7796          || float128_is_signaling_nan(b, status)) {
7797             float_raise(float_flag_invalid, status);
7798         }
7799         return 0;
7800     }
7801     return
7802            ( a.low == b.low )
7803         && (    ( a.high == b.high )
7804              || (    ( a.low == 0 )
7805                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
7806            );
7807 
7808 }
7809 
7810 /*----------------------------------------------------------------------------
7811 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7812 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
7813 | cause an exception.  Otherwise, the comparison is performed according to the
7814 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7815 *----------------------------------------------------------------------------*/
7816 
7817 int float128_le_quiet(float128 a, float128 b, float_status *status)
7818 {
7819     flag aSign, bSign;
7820 
7821     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7822               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7823          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7824               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7825        ) {
7826         if (float128_is_signaling_nan(a, status)
7827          || float128_is_signaling_nan(b, status)) {
7828             float_raise(float_flag_invalid, status);
7829         }
7830         return 0;
7831     }
7832     aSign = extractFloat128Sign( a );
7833     bSign = extractFloat128Sign( b );
7834     if ( aSign != bSign ) {
7835         return
7836                aSign
7837             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7838                  == 0 );
7839     }
7840     return
7841           aSign ? le128( b.high, b.low, a.high, a.low )
7842         : le128( a.high, a.low, b.high, b.low );
7843 
7844 }
7845 
7846 /*----------------------------------------------------------------------------
7847 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7848 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
7849 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
7850 | Standard for Binary Floating-Point Arithmetic.
7851 *----------------------------------------------------------------------------*/
7852 
7853 int float128_lt_quiet(float128 a, float128 b, float_status *status)
7854 {
7855     flag aSign, bSign;
7856 
7857     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7858               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7859          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7860               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7861        ) {
7862         if (float128_is_signaling_nan(a, status)
7863          || float128_is_signaling_nan(b, status)) {
7864             float_raise(float_flag_invalid, status);
7865         }
7866         return 0;
7867     }
7868     aSign = extractFloat128Sign( a );
7869     bSign = extractFloat128Sign( b );
7870     if ( aSign != bSign ) {
7871         return
7872                aSign
7873             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7874                  != 0 );
7875     }
7876     return
7877           aSign ? lt128( b.high, b.low, a.high, a.low )
7878         : lt128( a.high, a.low, b.high, b.low );
7879 
7880 }
7881 
7882 /*----------------------------------------------------------------------------
7883 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7884 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
7885 | comparison is performed according to the IEC/IEEE Standard for Binary
7886 | Floating-Point Arithmetic.
7887 *----------------------------------------------------------------------------*/
7888 
7889 int float128_unordered_quiet(float128 a, float128 b, float_status *status)
7890 {
7891     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7892               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7893          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7894               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7895        ) {
7896         if (float128_is_signaling_nan(a, status)
7897          || float128_is_signaling_nan(b, status)) {
7898             float_raise(float_flag_invalid, status);
7899         }
7900         return 1;
7901     }
7902     return 0;
7903 }
7904 
7905 static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
7906                                             int is_quiet, float_status *status)
7907 {
7908     flag aSign, bSign;
7909 
7910     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
7911         float_raise(float_flag_invalid, status);
7912         return float_relation_unordered;
7913     }
7914     if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
7915           ( extractFloatx80Frac( a )<<1 ) ) ||
7916         ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
7917           ( extractFloatx80Frac( b )<<1 ) )) {
7918         if (!is_quiet ||
7919             floatx80_is_signaling_nan(a, status) ||
7920             floatx80_is_signaling_nan(b, status)) {
7921             float_raise(float_flag_invalid, status);
7922         }
7923         return float_relation_unordered;
7924     }
7925     aSign = extractFloatx80Sign( a );
7926     bSign = extractFloatx80Sign( b );
7927     if ( aSign != bSign ) {
7928 
7929         if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
7930              ( ( a.low | b.low ) == 0 ) ) {
7931             /* zero case */
7932             return float_relation_equal;
7933         } else {
7934             return 1 - (2 * aSign);
7935         }
7936     } else {
7937         if (a.low == b.low && a.high == b.high) {
7938             return float_relation_equal;
7939         } else {
7940             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7941         }
7942     }
7943 }
7944 
7945 int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
7946 {
7947     return floatx80_compare_internal(a, b, 0, status);
7948 }
7949 
7950 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
7951 {
7952     return floatx80_compare_internal(a, b, 1, status);
7953 }
7954 
7955 static inline int float128_compare_internal(float128 a, float128 b,
7956                                             int is_quiet, float_status *status)
7957 {
7958     flag aSign, bSign;
7959 
7960     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7961           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7962         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7963           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7964         if (!is_quiet ||
7965             float128_is_signaling_nan(a, status) ||
7966             float128_is_signaling_nan(b, status)) {
7967             float_raise(float_flag_invalid, status);
7968         }
7969         return float_relation_unordered;
7970     }
7971     aSign = extractFloat128Sign( a );
7972     bSign = extractFloat128Sign( b );
7973     if ( aSign != bSign ) {
7974         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7975             /* zero case */
7976             return float_relation_equal;
7977         } else {
7978             return 1 - (2 * aSign);
7979         }
7980     } else {
7981         if (a.low == b.low && a.high == b.high) {
7982             return float_relation_equal;
7983         } else {
7984             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7985         }
7986     }
7987 }
7988 
7989 int float128_compare(float128 a, float128 b, float_status *status)
7990 {
7991     return float128_compare_internal(a, b, 0, status);
7992 }
7993 
7994 int float128_compare_quiet(float128 a, float128 b, float_status *status)
7995 {
7996     return float128_compare_internal(a, b, 1, status);
7997 }
7998 
7999 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
8000 {
8001     flag aSign;
8002     int32_t aExp;
8003     uint64_t aSig;
8004 
8005     if (floatx80_invalid_encoding(a)) {
8006         float_raise(float_flag_invalid, status);
8007         return floatx80_default_nan(status);
8008     }
8009     aSig = extractFloatx80Frac( a );
8010     aExp = extractFloatx80Exp( a );
8011     aSign = extractFloatx80Sign( a );
8012 
8013     if ( aExp == 0x7FFF ) {
8014         if ( aSig<<1 ) {
8015             return propagateFloatx80NaN(a, a, status);
8016         }
8017         return a;
8018     }
8019 
8020     if (aExp == 0) {
8021         if (aSig == 0) {
8022             return a;
8023         }
8024         aExp++;
8025     }
8026 
8027     if (n > 0x10000) {
8028         n = 0x10000;
8029     } else if (n < -0x10000) {
8030         n = -0x10000;
8031     }
8032 
8033     aExp += n;
8034     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
8035                                          aSign, aExp, aSig, 0, status);
8036 }
8037 
8038 float128 float128_scalbn(float128 a, int n, float_status *status)
8039 {
8040     flag aSign;
8041     int32_t aExp;
8042     uint64_t aSig0, aSig1;
8043 
8044     aSig1 = extractFloat128Frac1( a );
8045     aSig0 = extractFloat128Frac0( a );
8046     aExp = extractFloat128Exp( a );
8047     aSign = extractFloat128Sign( a );
8048     if ( aExp == 0x7FFF ) {
8049         if ( aSig0 | aSig1 ) {
8050             return propagateFloat128NaN(a, a, status);
8051         }
8052         return a;
8053     }
8054     if (aExp != 0) {
8055         aSig0 |= LIT64( 0x0001000000000000 );
8056     } else if (aSig0 == 0 && aSig1 == 0) {
8057         return a;
8058     } else {
8059         aExp++;
8060     }
8061 
8062     if (n > 0x10000) {
8063         n = 0x10000;
8064     } else if (n < -0x10000) {
8065         n = -0x10000;
8066     }
8067 
8068     aExp += n - 1;
8069     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
8070                                          , status);
8071 
8072 }
8073 
8074 static void __attribute__((constructor)) softfloat_init(void)
8075 {
8076     union_float64 ua, ub, uc, ur;
8077 
8078     if (QEMU_NO_HARDFLOAT) {
8079         return;
8080     }
8081     /*
8082      * Test that the host's FMA is not obviously broken. For example,
8083      * glibc < 2.23 can perform an incorrect FMA on certain hosts; see
8084      *   https://sourceware.org/bugzilla/show_bug.cgi?id=13304
8085      */
8086     ua.s = 0x0020000000000001ULL;
8087     ub.s = 0x3ca0000000000000ULL;
8088     uc.s = 0x0020000000000000ULL;
8089     ur.h = fma(ua.h, ub.h, uc.h);
8090     if (ur.s != 0x0020000000000001ULL) {
8091         force_soft_fma = true;
8092     }
8093 }
8094