xref: /openbmc/qemu/fpu/softfloat.c (revision 39164c13)
1 /*
2  * QEMU float support
3  *
4  * The code in this source file is derived from release 2a of the SoftFloat
5  * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6  * some later contributions) are provided under that license, as detailed below.
7  * It has subsequently been modified by contributors to the QEMU Project,
8  * so some portions are provided under:
9  *  the SoftFloat-2a license
10  *  the BSD license
11  *  GPL-v2-or-later
12  *
13  * Any future contributions to this file after December 1st 2014 will be
14  * taken to be licensed under the Softfloat-2a license unless specifically
15  * indicated otherwise.
16  */
17 
18 /*
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
22 
23 Written by John R. Hauser.  This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704.  Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980.  The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
32 
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38 
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
43 
44 ===============================================================================
45 */
46 
47 /* BSD licensing:
48  * Copyright (c) 2006, Fabrice Bellard
49  * All rights reserved.
50  *
51  * Redistribution and use in source and binary forms, with or without
52  * modification, are permitted provided that the following conditions are met:
53  *
54  * 1. Redistributions of source code must retain the above copyright notice,
55  * this list of conditions and the following disclaimer.
56  *
57  * 2. Redistributions in binary form must reproduce the above copyright notice,
58  * this list of conditions and the following disclaimer in the documentation
59  * and/or other materials provided with the distribution.
60  *
61  * 3. Neither the name of the copyright holder nor the names of its contributors
62  * may be used to endorse or promote products derived from this software without
63  * specific prior written permission.
64  *
65  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75  * THE POSSIBILITY OF SUCH DAMAGE.
76  */
77 
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79  * version 2 or later. See the COPYING file in the top-level directory.
80  */
81 
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83  * target-dependent and needs the TARGET_* macros.
84  */
85 #include "qemu/osdep.h"
86 
87 #include "fpu/softfloat.h"
88 
89 /* We only need stdlib for abort() */
90 
91 /*----------------------------------------------------------------------------
92 | Primitive arithmetic functions, including multi-word arithmetic, and
93 | division and square root approximations.  (Can be specialized to target if
94 | desired.)
95 *----------------------------------------------------------------------------*/
96 #include "softfloat-macros.h"
97 
98 /*----------------------------------------------------------------------------
99 | Functions and definitions to determine:  (1) whether tininess for underflow
100 | is detected before or after rounding by default, (2) what (if anything)
101 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
102 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
103 | are propagated from function inputs to output.  These details are target-
104 | specific.
105 *----------------------------------------------------------------------------*/
106 #include "softfloat-specialize.h"
107 
108 /*----------------------------------------------------------------------------
109 | Returns the fraction bits of the half-precision floating-point value `a'.
110 *----------------------------------------------------------------------------*/
111 
112 static inline uint32_t extractFloat16Frac(float16 a)
113 {
114     return float16_val(a) & 0x3ff;
115 }
116 
117 /*----------------------------------------------------------------------------
118 | Returns the exponent bits of the half-precision floating-point value `a'.
119 *----------------------------------------------------------------------------*/
120 
121 static inline int extractFloat16Exp(float16 a)
122 {
123     return (float16_val(a) >> 10) & 0x1f;
124 }
125 
126 /*----------------------------------------------------------------------------
127 | Returns the sign bit of the half-precision floating-point value `a'.
128 *----------------------------------------------------------------------------*/
129 
130 static inline flag extractFloat16Sign(float16 a)
131 {
132     return float16_val(a)>>15;
133 }
134 
135 /*----------------------------------------------------------------------------
136 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
137 | and 7, and returns the properly rounded 32-bit integer corresponding to the
138 | input.  If `zSign' is 1, the input is negated before being converted to an
139 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
140 | is simply rounded to an integer, with the inexact exception raised if the
141 | input cannot be represented exactly as an integer.  However, if the fixed-
142 | point input is too large, the invalid exception is raised and the largest
143 | positive or negative integer is returned.
144 *----------------------------------------------------------------------------*/
145 
146 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
147 {
148     int8_t roundingMode;
149     flag roundNearestEven;
150     int8_t roundIncrement, roundBits;
151     int32_t z;
152 
153     roundingMode = status->float_rounding_mode;
154     roundNearestEven = ( roundingMode == float_round_nearest_even );
155     switch (roundingMode) {
156     case float_round_nearest_even:
157     case float_round_ties_away:
158         roundIncrement = 0x40;
159         break;
160     case float_round_to_zero:
161         roundIncrement = 0;
162         break;
163     case float_round_up:
164         roundIncrement = zSign ? 0 : 0x7f;
165         break;
166     case float_round_down:
167         roundIncrement = zSign ? 0x7f : 0;
168         break;
169     default:
170         abort();
171     }
172     roundBits = absZ & 0x7F;
173     absZ = ( absZ + roundIncrement )>>7;
174     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
175     z = absZ;
176     if ( zSign ) z = - z;
177     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
178         float_raise(float_flag_invalid, status);
179         return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
180     }
181     if (roundBits) {
182         status->float_exception_flags |= float_flag_inexact;
183     }
184     return z;
185 
186 }
187 
188 /*----------------------------------------------------------------------------
189 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
190 | `absZ1', with binary point between bits 63 and 64 (between the input words),
191 | and returns the properly rounded 64-bit integer corresponding to the input.
192 | If `zSign' is 1, the input is negated before being converted to an integer.
193 | Ordinarily, the fixed-point input is simply rounded to an integer, with
194 | the inexact exception raised if the input cannot be represented exactly as
195 | an integer.  However, if the fixed-point input is too large, the invalid
196 | exception is raised and the largest positive or negative integer is
197 | returned.
198 *----------------------------------------------------------------------------*/
199 
200 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
201                                float_status *status)
202 {
203     int8_t roundingMode;
204     flag roundNearestEven, increment;
205     int64_t z;
206 
207     roundingMode = status->float_rounding_mode;
208     roundNearestEven = ( roundingMode == float_round_nearest_even );
209     switch (roundingMode) {
210     case float_round_nearest_even:
211     case float_round_ties_away:
212         increment = ((int64_t) absZ1 < 0);
213         break;
214     case float_round_to_zero:
215         increment = 0;
216         break;
217     case float_round_up:
218         increment = !zSign && absZ1;
219         break;
220     case float_round_down:
221         increment = zSign && absZ1;
222         break;
223     default:
224         abort();
225     }
226     if ( increment ) {
227         ++absZ0;
228         if ( absZ0 == 0 ) goto overflow;
229         absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
230     }
231     z = absZ0;
232     if ( zSign ) z = - z;
233     if ( z && ( ( z < 0 ) ^ zSign ) ) {
234  overflow:
235         float_raise(float_flag_invalid, status);
236         return
237               zSign ? (int64_t) LIT64( 0x8000000000000000 )
238             : LIT64( 0x7FFFFFFFFFFFFFFF );
239     }
240     if (absZ1) {
241         status->float_exception_flags |= float_flag_inexact;
242     }
243     return z;
244 
245 }
246 
247 /*----------------------------------------------------------------------------
248 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
249 | `absZ1', with binary point between bits 63 and 64 (between the input words),
250 | and returns the properly rounded 64-bit unsigned integer corresponding to the
251 | input.  Ordinarily, the fixed-point input is simply rounded to an integer,
252 | with the inexact exception raised if the input cannot be represented exactly
253 | as an integer.  However, if the fixed-point input is too large, the invalid
254 | exception is raised and the largest unsigned integer is returned.
255 *----------------------------------------------------------------------------*/
256 
257 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
258                                 uint64_t absZ1, float_status *status)
259 {
260     int8_t roundingMode;
261     flag roundNearestEven, increment;
262 
263     roundingMode = status->float_rounding_mode;
264     roundNearestEven = (roundingMode == float_round_nearest_even);
265     switch (roundingMode) {
266     case float_round_nearest_even:
267     case float_round_ties_away:
268         increment = ((int64_t)absZ1 < 0);
269         break;
270     case float_round_to_zero:
271         increment = 0;
272         break;
273     case float_round_up:
274         increment = !zSign && absZ1;
275         break;
276     case float_round_down:
277         increment = zSign && absZ1;
278         break;
279     default:
280         abort();
281     }
282     if (increment) {
283         ++absZ0;
284         if (absZ0 == 0) {
285             float_raise(float_flag_invalid, status);
286             return LIT64(0xFFFFFFFFFFFFFFFF);
287         }
288         absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
289     }
290 
291     if (zSign && absZ0) {
292         float_raise(float_flag_invalid, status);
293         return 0;
294     }
295 
296     if (absZ1) {
297         status->float_exception_flags |= float_flag_inexact;
298     }
299     return absZ0;
300 }
301 
302 /*----------------------------------------------------------------------------
303 | Returns the fraction bits of the single-precision floating-point value `a'.
304 *----------------------------------------------------------------------------*/
305 
306 static inline uint32_t extractFloat32Frac( float32 a )
307 {
308 
309     return float32_val(a) & 0x007FFFFF;
310 
311 }
312 
313 /*----------------------------------------------------------------------------
314 | Returns the exponent bits of the single-precision floating-point value `a'.
315 *----------------------------------------------------------------------------*/
316 
317 static inline int extractFloat32Exp(float32 a)
318 {
319 
320     return ( float32_val(a)>>23 ) & 0xFF;
321 
322 }
323 
324 /*----------------------------------------------------------------------------
325 | Returns the sign bit of the single-precision floating-point value `a'.
326 *----------------------------------------------------------------------------*/
327 
328 static inline flag extractFloat32Sign( float32 a )
329 {
330 
331     return float32_val(a)>>31;
332 
333 }
334 
335 /*----------------------------------------------------------------------------
336 | If `a' is denormal and we are in flush-to-zero mode then set the
337 | input-denormal exception and return zero. Otherwise just return the value.
338 *----------------------------------------------------------------------------*/
339 float32 float32_squash_input_denormal(float32 a, float_status *status)
340 {
341     if (status->flush_inputs_to_zero) {
342         if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
343             float_raise(float_flag_input_denormal, status);
344             return make_float32(float32_val(a) & 0x80000000);
345         }
346     }
347     return a;
348 }
349 
350 /*----------------------------------------------------------------------------
351 | Normalizes the subnormal single-precision floating-point value represented
352 | by the denormalized significand `aSig'.  The normalized exponent and
353 | significand are stored at the locations pointed to by `zExpPtr' and
354 | `zSigPtr', respectively.
355 *----------------------------------------------------------------------------*/
356 
static void
normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
{
    /*
     * Left shift that leaves exactly 8 leading zero bits in the
     * significand (for a non-zero aSig).
     */
    const int8_t lshift = countLeadingZeros32(aSig) - 8;

    *zSigPtr = aSig << lshift;
    /* Each bit shifted left lowers the stored exponent by one, from 1. */
    *zExpPtr = 1 - lshift;
}
367 
368 /*----------------------------------------------------------------------------
369 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
370 | single-precision floating-point value, returning the result.  After being
371 | shifted into the proper positions, the three fields are simply added
372 | together to form the result.  This means that any integer portion of `zSig'
373 | will be added into the exponent.  Since a properly normalized significand
374 | will have an integer portion equal to 1, the `zExp' input should be 1 less
375 | than the desired result exponent whenever `zSig' is a complete, normalized
376 | significand.
377 *----------------------------------------------------------------------------*/
378 
379 static inline float32 packFloat32(flag zSign, int zExp, uint32_t zSig)
380 {
381 
382     return make_float32(
383           ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
384 
385 }
386 
387 /*----------------------------------------------------------------------------
388 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
389 | and significand `zSig', and returns the proper single-precision floating-
390 | point value corresponding to the abstract input.  Ordinarily, the abstract
391 | value is simply rounded and packed into the single-precision format, with
392 | the inexact exception raised if the abstract input cannot be represented
393 | exactly.  However, if the abstract value is too large, the overflow and
394 | inexact exceptions are raised and an infinity or maximal finite value is
395 | returned.  If the abstract value is too small, the input value is rounded to
396 | a subnormal number, and the underflow and inexact exceptions are raised if
397 | the abstract input cannot be represented exactly as a subnormal single-
398 | precision floating-point number.
399 |     The input significand `zSig' has its binary point between bits 30
400 | and 29, which is 7 bits to the left of the usual location.  This shifted
401 | significand must be normalized or smaller.  If `zSig' is not normalized,
402 | `zExp' must be 0; in that case, the result returned is a subnormal number,
403 | and it must not require rounding.  In the usual case that `zSig' is
404 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
405 | The handling of underflow and overflow follows the IEC/IEEE Standard for
406 | Binary Floating-Point Arithmetic.
407 *----------------------------------------------------------------------------*/
408 
409 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
410                                    float_status *status)
411 {
412     int8_t roundingMode;
413     flag roundNearestEven;
414     int8_t roundIncrement, roundBits;
415     flag isTiny;
416 
417     roundingMode = status->float_rounding_mode;
418     roundNearestEven = ( roundingMode == float_round_nearest_even );
419     switch (roundingMode) {
420     case float_round_nearest_even:
421     case float_round_ties_away:
422         roundIncrement = 0x40;
423         break;
424     case float_round_to_zero:
425         roundIncrement = 0;
426         break;
427     case float_round_up:
428         roundIncrement = zSign ? 0 : 0x7f;
429         break;
430     case float_round_down:
431         roundIncrement = zSign ? 0x7f : 0;
432         break;
433     default:
434         abort();
435         break;
436     }
437     roundBits = zSig & 0x7F;
438     if ( 0xFD <= (uint16_t) zExp ) {
439         if (    ( 0xFD < zExp )
440              || (    ( zExp == 0xFD )
441                   && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
442            ) {
443             float_raise(float_flag_overflow | float_flag_inexact, status);
444             return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
445         }
446         if ( zExp < 0 ) {
447             if (status->flush_to_zero) {
448                 float_raise(float_flag_output_denormal, status);
449                 return packFloat32(zSign, 0, 0);
450             }
451             isTiny =
452                 (status->float_detect_tininess
453                  == float_tininess_before_rounding)
454                 || ( zExp < -1 )
455                 || ( zSig + roundIncrement < 0x80000000 );
456             shift32RightJamming( zSig, - zExp, &zSig );
457             zExp = 0;
458             roundBits = zSig & 0x7F;
459             if (isTiny && roundBits) {
460                 float_raise(float_flag_underflow, status);
461             }
462         }
463     }
464     if (roundBits) {
465         status->float_exception_flags |= float_flag_inexact;
466     }
467     zSig = ( zSig + roundIncrement )>>7;
468     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
469     if ( zSig == 0 ) zExp = 0;
470     return packFloat32( zSign, zExp, zSig );
471 
472 }
473 
474 /*----------------------------------------------------------------------------
475 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
476 | and significand `zSig', and returns the proper single-precision floating-
477 | point value corresponding to the abstract input.  This routine is just like
478 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
479 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
480 | floating-point exponent.
481 *----------------------------------------------------------------------------*/
482 
483 static float32
484  normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
485                               float_status *status)
486 {
487     int8_t shiftCount;
488 
489     shiftCount = countLeadingZeros32( zSig ) - 1;
490     return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
491                                status);
492 
493 }
494 
495 /*----------------------------------------------------------------------------
496 | Returns the fraction bits of the double-precision floating-point value `a'.
497 *----------------------------------------------------------------------------*/
498 
499 static inline uint64_t extractFloat64Frac( float64 a )
500 {
501 
502     return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
503 
504 }
505 
506 /*----------------------------------------------------------------------------
507 | Returns the exponent bits of the double-precision floating-point value `a'.
508 *----------------------------------------------------------------------------*/
509 
510 static inline int extractFloat64Exp(float64 a)
511 {
512 
513     return ( float64_val(a)>>52 ) & 0x7FF;
514 
515 }
516 
517 /*----------------------------------------------------------------------------
518 | Returns the sign bit of the double-precision floating-point value `a'.
519 *----------------------------------------------------------------------------*/
520 
521 static inline flag extractFloat64Sign( float64 a )
522 {
523 
524     return float64_val(a)>>63;
525 
526 }
527 
528 /*----------------------------------------------------------------------------
529 | If `a' is denormal and we are in flush-to-zero mode then set the
530 | input-denormal exception and return zero. Otherwise just return the value.
531 *----------------------------------------------------------------------------*/
532 float64 float64_squash_input_denormal(float64 a, float_status *status)
533 {
534     if (status->flush_inputs_to_zero) {
535         if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
536             float_raise(float_flag_input_denormal, status);
537             return make_float64(float64_val(a) & (1ULL << 63));
538         }
539     }
540     return a;
541 }
542 
543 /*----------------------------------------------------------------------------
544 | Normalizes the subnormal double-precision floating-point value represented
545 | by the denormalized significand `aSig'.  The normalized exponent and
546 | significand are stored at the locations pointed to by `zExpPtr' and
547 | `zSigPtr', respectively.
548 *----------------------------------------------------------------------------*/
549 
static void
normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
{
    /*
     * Left shift that leaves exactly 11 leading zero bits in the
     * significand (for a non-zero aSig).
     */
    const int8_t lshift = countLeadingZeros64(aSig) - 11;

    *zSigPtr = aSig << lshift;
    /* Each bit shifted left lowers the stored exponent by one, from 1. */
    *zExpPtr = 1 - lshift;
}
560 
561 /*----------------------------------------------------------------------------
562 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
563 | double-precision floating-point value, returning the result.  After being
564 | shifted into the proper positions, the three fields are simply added
565 | together to form the result.  This means that any integer portion of `zSig'
566 | will be added into the exponent.  Since a properly normalized significand
567 | will have an integer portion equal to 1, the `zExp' input should be 1 less
568 | than the desired result exponent whenever `zSig' is a complete, normalized
569 | significand.
570 *----------------------------------------------------------------------------*/
571 
572 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig)
573 {
574 
575     return make_float64(
576         ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
577 
578 }
579 
580 /*----------------------------------------------------------------------------
581 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
582 | and significand `zSig', and returns the proper double-precision floating-
583 | point value corresponding to the abstract input.  Ordinarily, the abstract
584 | value is simply rounded and packed into the double-precision format, with
585 | the inexact exception raised if the abstract input cannot be represented
586 | exactly.  However, if the abstract value is too large, the overflow and
587 | inexact exceptions are raised and an infinity or maximal finite value is
588 | returned.  If the abstract value is too small, the input value is rounded to
589 | a subnormal number, and the underflow and inexact exceptions are raised if
590 | the abstract input cannot be represented exactly as a subnormal double-
591 | precision floating-point number.
592 |     The input significand `zSig' has its binary point between bits 62
593 | and 61, which is 10 bits to the left of the usual location.  This shifted
594 | significand must be normalized or smaller.  If `zSig' is not normalized,
595 | `zExp' must be 0; in that case, the result returned is a subnormal number,
596 | and it must not require rounding.  In the usual case that `zSig' is
597 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
598 | The handling of underflow and overflow follows the IEC/IEEE Standard for
599 | Binary Floating-Point Arithmetic.
600 *----------------------------------------------------------------------------*/
601 
602 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
603                                    float_status *status)
604 {
605     int8_t roundingMode;
606     flag roundNearestEven;
607     int roundIncrement, roundBits;
608     flag isTiny;
609 
610     roundingMode = status->float_rounding_mode;
611     roundNearestEven = ( roundingMode == float_round_nearest_even );
612     switch (roundingMode) {
613     case float_round_nearest_even:
614     case float_round_ties_away:
615         roundIncrement = 0x200;
616         break;
617     case float_round_to_zero:
618         roundIncrement = 0;
619         break;
620     case float_round_up:
621         roundIncrement = zSign ? 0 : 0x3ff;
622         break;
623     case float_round_down:
624         roundIncrement = zSign ? 0x3ff : 0;
625         break;
626     case float_round_to_odd:
627         roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
628         break;
629     default:
630         abort();
631     }
632     roundBits = zSig & 0x3FF;
633     if ( 0x7FD <= (uint16_t) zExp ) {
634         if (    ( 0x7FD < zExp )
635              || (    ( zExp == 0x7FD )
636                   && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
637            ) {
638             bool overflow_to_inf = roundingMode != float_round_to_odd &&
639                                    roundIncrement != 0;
640             float_raise(float_flag_overflow | float_flag_inexact, status);
641             return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
642         }
643         if ( zExp < 0 ) {
644             if (status->flush_to_zero) {
645                 float_raise(float_flag_output_denormal, status);
646                 return packFloat64(zSign, 0, 0);
647             }
648             isTiny =
649                    (status->float_detect_tininess
650                     == float_tininess_before_rounding)
651                 || ( zExp < -1 )
652                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
653             shift64RightJamming( zSig, - zExp, &zSig );
654             zExp = 0;
655             roundBits = zSig & 0x3FF;
656             if (isTiny && roundBits) {
657                 float_raise(float_flag_underflow, status);
658             }
659             if (roundingMode == float_round_to_odd) {
660                 /*
661                  * For round-to-odd case, the roundIncrement depends on
662                  * zSig which just changed.
663                  */
664                 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
665             }
666         }
667     }
668     if (roundBits) {
669         status->float_exception_flags |= float_flag_inexact;
670     }
671     zSig = ( zSig + roundIncrement )>>10;
672     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
673     if ( zSig == 0 ) zExp = 0;
674     return packFloat64( zSign, zExp, zSig );
675 
676 }
677 
678 /*----------------------------------------------------------------------------
679 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
680 | and significand `zSig', and returns the proper double-precision floating-
681 | point value corresponding to the abstract input.  This routine is just like
682 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
683 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
684 | floating-point exponent.
685 *----------------------------------------------------------------------------*/
686 
687 static float64
688  normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
689                               float_status *status)
690 {
691     int8_t shiftCount;
692 
693     shiftCount = countLeadingZeros64( zSig ) - 1;
694     return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
695                                status);
696 
697 }
698 
699 /*----------------------------------------------------------------------------
700 | Returns the fraction bits of the extended double-precision floating-point
701 | value `a'.
702 *----------------------------------------------------------------------------*/
703 
704 static inline uint64_t extractFloatx80Frac( floatx80 a )
705 {
706 
707     return a.low;
708 
709 }
710 
711 /*----------------------------------------------------------------------------
712 | Returns the exponent bits of the extended double-precision floating-point
713 | value `a'.
714 *----------------------------------------------------------------------------*/
715 
716 static inline int32_t extractFloatx80Exp( floatx80 a )
717 {
718 
719     return a.high & 0x7FFF;
720 
721 }
722 
723 /*----------------------------------------------------------------------------
724 | Returns the sign bit of the extended double-precision floating-point value
725 | `a'.
726 *----------------------------------------------------------------------------*/
727 
728 static inline flag extractFloatx80Sign( floatx80 a )
729 {
730 
731     return a.high>>15;
732 
733 }
734 
/*----------------------------------------------------------------------------
| Normalizes the denormalized significand `aSig' of a subnormal extended
| double-precision value.  The significand is shifted left until its most
| significant set bit reaches bit 63 and is written to `*zSigPtr'; the matching
| exponent, 1 minus the number of places shifted, is written to `*zExpPtr'.
*----------------------------------------------------------------------------*/

static void
 normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr, uint64_t *zSigPtr)
{
    int8_t leadingZeros = countLeadingZeros64(aSig);

    *zExpPtr = 1 - leadingZeros;
    *zSigPtr = aSig << leadingZeros;
}
752 
753 /*----------------------------------------------------------------------------
754 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
755 | extended double-precision floating-point value, returning the result.
756 *----------------------------------------------------------------------------*/
757 
758 static inline floatx80 packFloatx80( flag zSign, int32_t zExp, uint64_t zSig )
759 {
760     floatx80 z;
761 
762     z.low = zSig;
763     z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
764     return z;
765 
766 }
767 
768 /*----------------------------------------------------------------------------
769 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
770 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
771 | and returns the proper extended double-precision floating-point value
772 | corresponding to the abstract input.  Ordinarily, the abstract value is
773 | rounded and packed into the extended double-precision format, with the
774 | inexact exception raised if the abstract input cannot be represented
775 | exactly.  However, if the abstract value is too large, the overflow and
776 | inexact exceptions are raised and an infinity or maximal finite value is
777 | returned.  If the abstract value is too small, the input value is rounded to
778 | a subnormal number, and the underflow and inexact exceptions are raised if
779 | the abstract input cannot be represented exactly as a subnormal extended
780 | double-precision floating-point number.
781 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
782 | number of bits as single or double precision, respectively.  Otherwise, the
783 | result is rounded to the full precision of the extended double-precision
784 | format.
785 |     The input significand must be normalized or smaller.  If the input
786 | significand is not normalized, `zExp' must be 0; in that case, the result
787 | returned is a subnormal number, and it must not require rounding.  The
788 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
789 | Floating-Point Arithmetic.
790 *----------------------------------------------------------------------------*/
791 
792 static floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
793                                      int32_t zExp, uint64_t zSig0, uint64_t zSig1,
794                                      float_status *status)
795 {
796     int8_t roundingMode;
797     flag roundNearestEven, increment, isTiny;
798     int64_t roundIncrement, roundMask, roundBits;
799 
800     roundingMode = status->float_rounding_mode;
801     roundNearestEven = ( roundingMode == float_round_nearest_even );
802     if ( roundingPrecision == 80 ) goto precision80;
803     if ( roundingPrecision == 64 ) {
804         roundIncrement = LIT64( 0x0000000000000400 );
805         roundMask = LIT64( 0x00000000000007FF );
806     }
807     else if ( roundingPrecision == 32 ) {
808         roundIncrement = LIT64( 0x0000008000000000 );
809         roundMask = LIT64( 0x000000FFFFFFFFFF );
810     }
811     else {
812         goto precision80;
813     }
814     zSig0 |= ( zSig1 != 0 );
815     switch (roundingMode) {
816     case float_round_nearest_even:
817     case float_round_ties_away:
818         break;
819     case float_round_to_zero:
820         roundIncrement = 0;
821         break;
822     case float_round_up:
823         roundIncrement = zSign ? 0 : roundMask;
824         break;
825     case float_round_down:
826         roundIncrement = zSign ? roundMask : 0;
827         break;
828     default:
829         abort();
830     }
831     roundBits = zSig0 & roundMask;
832     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
833         if (    ( 0x7FFE < zExp )
834              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
835            ) {
836             goto overflow;
837         }
838         if ( zExp <= 0 ) {
839             if (status->flush_to_zero) {
840                 float_raise(float_flag_output_denormal, status);
841                 return packFloatx80(zSign, 0, 0);
842             }
843             isTiny =
844                    (status->float_detect_tininess
845                     == float_tininess_before_rounding)
846                 || ( zExp < 0 )
847                 || ( zSig0 <= zSig0 + roundIncrement );
848             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
849             zExp = 0;
850             roundBits = zSig0 & roundMask;
851             if (isTiny && roundBits) {
852                 float_raise(float_flag_underflow, status);
853             }
854             if (roundBits) {
855                 status->float_exception_flags |= float_flag_inexact;
856             }
857             zSig0 += roundIncrement;
858             if ( (int64_t) zSig0 < 0 ) zExp = 1;
859             roundIncrement = roundMask + 1;
860             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
861                 roundMask |= roundIncrement;
862             }
863             zSig0 &= ~ roundMask;
864             return packFloatx80( zSign, zExp, zSig0 );
865         }
866     }
867     if (roundBits) {
868         status->float_exception_flags |= float_flag_inexact;
869     }
870     zSig0 += roundIncrement;
871     if ( zSig0 < roundIncrement ) {
872         ++zExp;
873         zSig0 = LIT64( 0x8000000000000000 );
874     }
875     roundIncrement = roundMask + 1;
876     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
877         roundMask |= roundIncrement;
878     }
879     zSig0 &= ~ roundMask;
880     if ( zSig0 == 0 ) zExp = 0;
881     return packFloatx80( zSign, zExp, zSig0 );
882  precision80:
883     switch (roundingMode) {
884     case float_round_nearest_even:
885     case float_round_ties_away:
886         increment = ((int64_t)zSig1 < 0);
887         break;
888     case float_round_to_zero:
889         increment = 0;
890         break;
891     case float_round_up:
892         increment = !zSign && zSig1;
893         break;
894     case float_round_down:
895         increment = zSign && zSig1;
896         break;
897     default:
898         abort();
899     }
900     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
901         if (    ( 0x7FFE < zExp )
902              || (    ( zExp == 0x7FFE )
903                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
904                   && increment
905                 )
906            ) {
907             roundMask = 0;
908  overflow:
909             float_raise(float_flag_overflow | float_flag_inexact, status);
910             if (    ( roundingMode == float_round_to_zero )
911                  || ( zSign && ( roundingMode == float_round_up ) )
912                  || ( ! zSign && ( roundingMode == float_round_down ) )
913                ) {
914                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
915             }
916             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
917         }
918         if ( zExp <= 0 ) {
919             isTiny =
920                    (status->float_detect_tininess
921                     == float_tininess_before_rounding)
922                 || ( zExp < 0 )
923                 || ! increment
924                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
925             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
926             zExp = 0;
927             if (isTiny && zSig1) {
928                 float_raise(float_flag_underflow, status);
929             }
930             if (zSig1) {
931                 status->float_exception_flags |= float_flag_inexact;
932             }
933             switch (roundingMode) {
934             case float_round_nearest_even:
935             case float_round_ties_away:
936                 increment = ((int64_t)zSig1 < 0);
937                 break;
938             case float_round_to_zero:
939                 increment = 0;
940                 break;
941             case float_round_up:
942                 increment = !zSign && zSig1;
943                 break;
944             case float_round_down:
945                 increment = zSign && zSig1;
946                 break;
947             default:
948                 abort();
949             }
950             if ( increment ) {
951                 ++zSig0;
952                 zSig0 &=
953                     ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
954                 if ( (int64_t) zSig0 < 0 ) zExp = 1;
955             }
956             return packFloatx80( zSign, zExp, zSig0 );
957         }
958     }
959     if (zSig1) {
960         status->float_exception_flags |= float_flag_inexact;
961     }
962     if ( increment ) {
963         ++zSig0;
964         if ( zSig0 == 0 ) {
965             ++zExp;
966             zSig0 = LIT64( 0x8000000000000000 );
967         }
968         else {
969             zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
970         }
971     }
972     else {
973         if ( zSig0 == 0 ) zExp = 0;
974     }
975     return packFloatx80( zSign, zExp, zSig0 );
976 
977 }
978 
979 /*----------------------------------------------------------------------------
980 | Takes an abstract floating-point value having sign `zSign', exponent
981 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
982 | and returns the proper extended double-precision floating-point value
983 | corresponding to the abstract input.  This routine is just like
984 | `roundAndPackFloatx80' except that the input significand does not have to be
985 | normalized.
986 *----------------------------------------------------------------------------*/
987 
988 static floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
989                                               flag zSign, int32_t zExp,
990                                               uint64_t zSig0, uint64_t zSig1,
991                                               float_status *status)
992 {
993     int8_t shiftCount;
994 
995     if ( zSig0 == 0 ) {
996         zSig0 = zSig1;
997         zSig1 = 0;
998         zExp -= 64;
999     }
1000     shiftCount = countLeadingZeros64( zSig0 );
1001     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1002     zExp -= shiftCount;
1003     return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
1004                                 zSig0, zSig1, status);
1005 
1006 }
1007 
1008 /*----------------------------------------------------------------------------
1009 | Returns the least-significant 64 fraction bits of the quadruple-precision
1010 | floating-point value `a'.
1011 *----------------------------------------------------------------------------*/
1012 
1013 static inline uint64_t extractFloat128Frac1( float128 a )
1014 {
1015 
1016     return a.low;
1017 
1018 }
1019 
1020 /*----------------------------------------------------------------------------
1021 | Returns the most-significant 48 fraction bits of the quadruple-precision
1022 | floating-point value `a'.
1023 *----------------------------------------------------------------------------*/
1024 
1025 static inline uint64_t extractFloat128Frac0( float128 a )
1026 {
1027 
1028     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
1029 
1030 }
1031 
1032 /*----------------------------------------------------------------------------
1033 | Returns the exponent bits of the quadruple-precision floating-point value
1034 | `a'.
1035 *----------------------------------------------------------------------------*/
1036 
1037 static inline int32_t extractFloat128Exp( float128 a )
1038 {
1039 
1040     return ( a.high>>48 ) & 0x7FFF;
1041 
1042 }
1043 
1044 /*----------------------------------------------------------------------------
1045 | Returns the sign bit of the quadruple-precision floating-point value `a'.
1046 *----------------------------------------------------------------------------*/
1047 
1048 static inline flag extractFloat128Sign( float128 a )
1049 {
1050 
1051     return a.high>>63;
1052 
1053 }
1054 
/*----------------------------------------------------------------------------
| Normalizes the denormalized significand of a subnormal quadruple-precision
| value, given as the concatenation of `aSig0' and `aSig1'.  The normalized
| exponent is written to `*zExpPtr'.  The most significant 49 bits of the
| normalized significand are written to `*zSig0Ptr' and the least significant
| 64 bits to `*zSig1Ptr'.
*----------------------------------------------------------------------------*/

static void
 normalizeFloat128Subnormal(
     uint64_t aSig0,
     uint64_t aSig1,
     int32_t *zExpPtr,
     uint64_t *zSig0Ptr,
     uint64_t *zSig1Ptr
 )
{
    int8_t shiftCount;

    if (aSig0 != 0) {
        /* Upper word is non-empty: a short left shift of the pair suffices. */
        shiftCount = countLeadingZeros64(aSig0) - 15;
        shortShift128Left(aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr);
        *zExpPtr = 1 - shiftCount;
        return;
    }

    /* Only the lower word holds bits; it has to move into the upper word. */
    shiftCount = countLeadingZeros64(aSig1) - 15;
    if (shiftCount >= 0) {
        *zSig0Ptr = aSig1 << shiftCount;
        *zSig1Ptr = 0;
    } else {
        /* Fewer than 15 leading zeros: the value straddles both words. */
        *zSig0Ptr = aSig1 >> (-shiftCount);
        *zSig1Ptr = aSig1 << (shiftCount & 63);
    }
    *zExpPtr = -shiftCount - 63;
}
1095 
1096 /*----------------------------------------------------------------------------
1097 | Packs the sign `zSign', the exponent `zExp', and the significand formed
1098 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
1099 | floating-point value, returning the result.  After being shifted into the
1100 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
1101 | added together to form the most significant 32 bits of the result.  This
1102 | means that any integer portion of `zSig0' will be added into the exponent.
1103 | Since a properly normalized significand will have an integer portion equal
1104 | to 1, the `zExp' input should be 1 less than the desired result exponent
1105 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
1106 | significand.
1107 *----------------------------------------------------------------------------*/
1108 
1109 static inline float128
1110  packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
1111 {
1112     float128 z;
1113 
1114     z.low = zSig1;
1115     z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
1116     return z;
1117 
1118 }
1119 
1120 /*----------------------------------------------------------------------------
1121 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1122 | and extended significand formed by the concatenation of `zSig0', `zSig1',
1123 | and `zSig2', and returns the proper quadruple-precision floating-point value
1124 | corresponding to the abstract input.  Ordinarily, the abstract value is
1125 | simply rounded and packed into the quadruple-precision format, with the
1126 | inexact exception raised if the abstract input cannot be represented
1127 | exactly.  However, if the abstract value is too large, the overflow and
1128 | inexact exceptions are raised and an infinity or maximal finite value is
1129 | returned.  If the abstract value is too small, the input value is rounded to
1130 | a subnormal number, and the underflow and inexact exceptions are raised if
1131 | the abstract input cannot be represented exactly as a subnormal quadruple-
1132 | precision floating-point number.
1133 |     The input significand must be normalized or smaller.  If the input
1134 | significand is not normalized, `zExp' must be 0; in that case, the result
1135 | returned is a subnormal number, and it must not require rounding.  In the
1136 | usual case that the input significand is normalized, `zExp' must be 1 less
1137 | than the ``true'' floating-point exponent.  The handling of underflow and
1138 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1139 *----------------------------------------------------------------------------*/
1140 
1141 static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
1142                                      uint64_t zSig0, uint64_t zSig1,
1143                                      uint64_t zSig2, float_status *status)
1144 {
1145     int8_t roundingMode;
1146     flag roundNearestEven, increment, isTiny;
1147 
1148     roundingMode = status->float_rounding_mode;
1149     roundNearestEven = ( roundingMode == float_round_nearest_even );
1150     switch (roundingMode) {
1151     case float_round_nearest_even:
1152     case float_round_ties_away:
1153         increment = ((int64_t)zSig2 < 0);
1154         break;
1155     case float_round_to_zero:
1156         increment = 0;
1157         break;
1158     case float_round_up:
1159         increment = !zSign && zSig2;
1160         break;
1161     case float_round_down:
1162         increment = zSign && zSig2;
1163         break;
1164     case float_round_to_odd:
1165         increment = !(zSig1 & 0x1) && zSig2;
1166         break;
1167     default:
1168         abort();
1169     }
1170     if ( 0x7FFD <= (uint32_t) zExp ) {
1171         if (    ( 0x7FFD < zExp )
1172              || (    ( zExp == 0x7FFD )
1173                   && eq128(
1174                          LIT64( 0x0001FFFFFFFFFFFF ),
1175                          LIT64( 0xFFFFFFFFFFFFFFFF ),
1176                          zSig0,
1177                          zSig1
1178                      )
1179                   && increment
1180                 )
1181            ) {
1182             float_raise(float_flag_overflow | float_flag_inexact, status);
1183             if (    ( roundingMode == float_round_to_zero )
1184                  || ( zSign && ( roundingMode == float_round_up ) )
1185                  || ( ! zSign && ( roundingMode == float_round_down ) )
1186                  || (roundingMode == float_round_to_odd)
1187                ) {
1188                 return
1189                     packFloat128(
1190                         zSign,
1191                         0x7FFE,
1192                         LIT64( 0x0000FFFFFFFFFFFF ),
1193                         LIT64( 0xFFFFFFFFFFFFFFFF )
1194                     );
1195             }
1196             return packFloat128( zSign, 0x7FFF, 0, 0 );
1197         }
1198         if ( zExp < 0 ) {
1199             if (status->flush_to_zero) {
1200                 float_raise(float_flag_output_denormal, status);
1201                 return packFloat128(zSign, 0, 0, 0);
1202             }
1203             isTiny =
1204                    (status->float_detect_tininess
1205                     == float_tininess_before_rounding)
1206                 || ( zExp < -1 )
1207                 || ! increment
1208                 || lt128(
1209                        zSig0,
1210                        zSig1,
1211                        LIT64( 0x0001FFFFFFFFFFFF ),
1212                        LIT64( 0xFFFFFFFFFFFFFFFF )
1213                    );
1214             shift128ExtraRightJamming(
1215                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1216             zExp = 0;
1217             if (isTiny && zSig2) {
1218                 float_raise(float_flag_underflow, status);
1219             }
1220             switch (roundingMode) {
1221             case float_round_nearest_even:
1222             case float_round_ties_away:
1223                 increment = ((int64_t)zSig2 < 0);
1224                 break;
1225             case float_round_to_zero:
1226                 increment = 0;
1227                 break;
1228             case float_round_up:
1229                 increment = !zSign && zSig2;
1230                 break;
1231             case float_round_down:
1232                 increment = zSign && zSig2;
1233                 break;
1234             case float_round_to_odd:
1235                 increment = !(zSig1 & 0x1) && zSig2;
1236                 break;
1237             default:
1238                 abort();
1239             }
1240         }
1241     }
1242     if (zSig2) {
1243         status->float_exception_flags |= float_flag_inexact;
1244     }
1245     if ( increment ) {
1246         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1247         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1248     }
1249     else {
1250         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1251     }
1252     return packFloat128( zSign, zExp, zSig0, zSig1 );
1253 
1254 }
1255 
1256 /*----------------------------------------------------------------------------
1257 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1258 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1259 | returns the proper quadruple-precision floating-point value corresponding
1260 | to the abstract input.  This routine is just like `roundAndPackFloat128'
1261 | except that the input significand has fewer bits and does not have to be
1262 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1263 | point exponent.
1264 *----------------------------------------------------------------------------*/
1265 
1266 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
1267                                               uint64_t zSig0, uint64_t zSig1,
1268                                               float_status *status)
1269 {
1270     int8_t shiftCount;
1271     uint64_t zSig2;
1272 
1273     if ( zSig0 == 0 ) {
1274         zSig0 = zSig1;
1275         zSig1 = 0;
1276         zExp -= 64;
1277     }
1278     shiftCount = countLeadingZeros64( zSig0 ) - 15;
1279     if ( 0 <= shiftCount ) {
1280         zSig2 = 0;
1281         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1282     }
1283     else {
1284         shift128ExtraRightJamming(
1285             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1286     }
1287     zExp -= shiftCount;
1288     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
1289 
1290 }
1291 
1292 /*----------------------------------------------------------------------------
1293 | Returns the result of converting the 32-bit two's complement integer `a'
1294 | to the single-precision floating-point format.  The conversion is performed
1295 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1296 *----------------------------------------------------------------------------*/
1297 
1298 float32 int32_to_float32(int32_t a, float_status *status)
1299 {
1300     flag zSign;
1301 
1302     if ( a == 0 ) return float32_zero;
1303     if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1304     zSign = ( a < 0 );
1305     return normalizeRoundAndPackFloat32(zSign, 0x9C, zSign ? -a : a, status);
1306 }
1307 
1308 /*----------------------------------------------------------------------------
1309 | Returns the result of converting the 32-bit two's complement integer `a'
1310 | to the double-precision floating-point format.  The conversion is performed
1311 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1312 *----------------------------------------------------------------------------*/
1313 
1314 float64 int32_to_float64(int32_t a, float_status *status)
1315 {
1316     flag zSign;
1317     uint32_t absA;
1318     int8_t shiftCount;
1319     uint64_t zSig;
1320 
1321     if ( a == 0 ) return float64_zero;
1322     zSign = ( a < 0 );
1323     absA = zSign ? - a : a;
1324     shiftCount = countLeadingZeros32( absA ) + 21;
1325     zSig = absA;
1326     return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1327 
1328 }
1329 
1330 /*----------------------------------------------------------------------------
1331 | Returns the result of converting the 32-bit two's complement integer `a'
1332 | to the extended double-precision floating-point format.  The conversion
1333 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1334 | Arithmetic.
1335 *----------------------------------------------------------------------------*/
1336 
1337 floatx80 int32_to_floatx80(int32_t a, float_status *status)
1338 {
1339     flag zSign;
1340     uint32_t absA;
1341     int8_t shiftCount;
1342     uint64_t zSig;
1343 
1344     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1345     zSign = ( a < 0 );
1346     absA = zSign ? - a : a;
1347     shiftCount = countLeadingZeros32( absA ) + 32;
1348     zSig = absA;
1349     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1350 
1351 }
1352 
1353 /*----------------------------------------------------------------------------
1354 | Returns the result of converting the 32-bit two's complement integer `a' to
1355 | the quadruple-precision floating-point format.  The conversion is performed
1356 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1357 *----------------------------------------------------------------------------*/
1358 
1359 float128 int32_to_float128(int32_t a, float_status *status)
1360 {
1361     flag zSign;
1362     uint32_t absA;
1363     int8_t shiftCount;
1364     uint64_t zSig0;
1365 
1366     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1367     zSign = ( a < 0 );
1368     absA = zSign ? - a : a;
1369     shiftCount = countLeadingZeros32( absA ) + 17;
1370     zSig0 = absA;
1371     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1372 
1373 }
1374 
1375 /*----------------------------------------------------------------------------
1376 | Returns the result of converting the 64-bit two's complement integer `a'
1377 | to the single-precision floating-point format.  The conversion is performed
1378 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1379 *----------------------------------------------------------------------------*/
1380 
1381 float32 int64_to_float32(int64_t a, float_status *status)
1382 {
1383     flag zSign;
1384     uint64_t absA;
1385     int8_t shiftCount;
1386 
1387     if ( a == 0 ) return float32_zero;
1388     zSign = ( a < 0 );
1389     absA = zSign ? - a : a;
1390     shiftCount = countLeadingZeros64( absA ) - 40;
1391     if ( 0 <= shiftCount ) {
1392         return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1393     }
1394     else {
1395         shiftCount += 7;
1396         if ( shiftCount < 0 ) {
1397             shift64RightJamming( absA, - shiftCount, &absA );
1398         }
1399         else {
1400             absA <<= shiftCount;
1401         }
1402         return roundAndPackFloat32(zSign, 0x9C - shiftCount, absA, status);
1403     }
1404 
1405 }
1406 
1407 /*----------------------------------------------------------------------------
1408 | Returns the result of converting the 64-bit two's complement integer `a'
1409 | to the double-precision floating-point format.  The conversion is performed
1410 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1411 *----------------------------------------------------------------------------*/
1412 
1413 float64 int64_to_float64(int64_t a, float_status *status)
1414 {
1415     flag zSign;
1416 
1417     if ( a == 0 ) return float64_zero;
1418     if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
1419         return packFloat64( 1, 0x43E, 0 );
1420     }
1421     zSign = ( a < 0 );
1422     return normalizeRoundAndPackFloat64(zSign, 0x43C, zSign ? -a : a, status);
1423 }
1424 
1425 /*----------------------------------------------------------------------------
1426 | Returns the result of converting the 64-bit two's complement integer `a'
1427 | to the extended double-precision floating-point format.  The conversion
1428 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1429 | Arithmetic.
1430 *----------------------------------------------------------------------------*/
1431 
1432 floatx80 int64_to_floatx80(int64_t a, float_status *status)
1433 {
1434     flag zSign;
1435     uint64_t absA;
1436     int8_t shiftCount;
1437 
1438     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1439     zSign = ( a < 0 );
1440     absA = zSign ? - a : a;
1441     shiftCount = countLeadingZeros64( absA );
1442     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1443 
1444 }
1445 
1446 /*----------------------------------------------------------------------------
1447 | Returns the result of converting the 64-bit two's complement integer `a' to
1448 | the quadruple-precision floating-point format.  The conversion is performed
1449 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1450 *----------------------------------------------------------------------------*/
1451 
1452 float128 int64_to_float128(int64_t a, float_status *status)
1453 {
1454     flag zSign;
1455     uint64_t absA;
1456     int8_t shiftCount;
1457     int32_t zExp;
1458     uint64_t zSig0, zSig1;
1459 
1460     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1461     zSign = ( a < 0 );
1462     absA = zSign ? - a : a;
1463     shiftCount = countLeadingZeros64( absA ) + 49;
1464     zExp = 0x406E - shiftCount;
1465     if ( 64 <= shiftCount ) {
1466         zSig1 = 0;
1467         zSig0 = absA;
1468         shiftCount -= 64;
1469     }
1470     else {
1471         zSig1 = absA;
1472         zSig0 = 0;
1473     }
1474     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1475     return packFloat128( zSign, zExp, zSig0, zSig1 );
1476 
1477 }
1478 
1479 /*----------------------------------------------------------------------------
1480 | Returns the result of converting the 64-bit unsigned integer `a'
1481 | to the single-precision floating-point format.  The conversion is performed
1482 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1483 *----------------------------------------------------------------------------*/
1484 
1485 float32 uint64_to_float32(uint64_t a, float_status *status)
1486 {
1487     int shiftcount;
1488 
1489     if (a == 0) {
1490         return float32_zero;
1491     }
1492 
1493     /* Determine (left) shift needed to put first set bit into bit posn 23
1494      * (since packFloat32() expects the binary point between bits 23 and 22);
1495      * this is the fast case for smallish numbers.
1496      */
1497     shiftcount = countLeadingZeros64(a) - 40;
1498     if (shiftcount >= 0) {
1499         return packFloat32(0, 0x95 - shiftcount, a << shiftcount);
1500     }
1501     /* Otherwise we need to do a round-and-pack. roundAndPackFloat32()
1502      * expects the binary point between bits 30 and 29, hence the + 7.
1503      */
1504     shiftcount += 7;
1505     if (shiftcount < 0) {
1506         shift64RightJamming(a, -shiftcount, &a);
1507     } else {
1508         a <<= shiftcount;
1509     }
1510 
1511     return roundAndPackFloat32(0, 0x9c - shiftcount, a, status);
1512 }
1513 
1514 /*----------------------------------------------------------------------------
1515 | Returns the result of converting the 64-bit unsigned integer `a'
1516 | to the double-precision floating-point format.  The conversion is performed
1517 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1518 *----------------------------------------------------------------------------*/
1519 
1520 float64 uint64_to_float64(uint64_t a, float_status *status)
1521 {
1522     int exp = 0x43C;
1523     int shiftcount;
1524 
1525     if (a == 0) {
1526         return float64_zero;
1527     }
1528 
1529     shiftcount = countLeadingZeros64(a) - 1;
1530     if (shiftcount < 0) {
1531         shift64RightJamming(a, -shiftcount, &a);
1532     } else {
1533         a <<= shiftcount;
1534     }
1535     return roundAndPackFloat64(0, exp - shiftcount, a, status);
1536 }
1537 
1538 /*----------------------------------------------------------------------------
1539 | Returns the result of converting the 64-bit unsigned integer `a'
1540 | to the quadruple-precision floating-point format.  The conversion is performed
1541 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1542 *----------------------------------------------------------------------------*/
1543 
1544 float128 uint64_to_float128(uint64_t a, float_status *status)
1545 {
1546     if (a == 0) {
1547         return float128_zero;
1548     }
1549     return normalizeRoundAndPackFloat128(0, 0x406E, a, 0, status);
1550 }
1551 
1552 /*----------------------------------------------------------------------------
1553 | Returns the result of converting the single-precision floating-point value
1554 | `a' to the 32-bit two's complement integer format.  The conversion is
1555 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1556 | Arithmetic---which means in particular that the conversion is rounded
1557 | according to the current rounding mode.  If `a' is a NaN, the largest
1558 | positive integer is returned.  Otherwise, if the conversion overflows, the
1559 | largest integer with the same sign as `a' is returned.
1560 *----------------------------------------------------------------------------*/
1561 
1562 int32_t float32_to_int32(float32 a, float_status *status)
1563 {
1564     flag aSign;
1565     int aExp;
1566     int shiftCount;
1567     uint32_t aSig;
1568     uint64_t aSig64;
1569 
1570     a = float32_squash_input_denormal(a, status);
1571     aSig = extractFloat32Frac( a );
1572     aExp = extractFloat32Exp( a );
1573     aSign = extractFloat32Sign( a );
1574     if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1575     if ( aExp ) aSig |= 0x00800000;
1576     shiftCount = 0xAF - aExp;
1577     aSig64 = aSig;
1578     aSig64 <<= 32;
1579     if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1580     return roundAndPackInt32(aSign, aSig64, status);
1581 
1582 }
1583 
1584 /*----------------------------------------------------------------------------
1585 | Returns the result of converting the single-precision floating-point value
1586 | `a' to the 32-bit two's complement integer format.  The conversion is
1587 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1588 | Arithmetic, except that the conversion is always rounded toward zero.
1589 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1590 | the conversion overflows, the largest integer with the same sign as `a' is
1591 | returned.
1592 *----------------------------------------------------------------------------*/
1593 
1594 int32_t float32_to_int32_round_to_zero(float32 a, float_status *status)
1595 {
1596     flag aSign;
1597     int aExp;
1598     int shiftCount;
1599     uint32_t aSig;
1600     int32_t z;
1601     a = float32_squash_input_denormal(a, status);
1602 
1603     aSig = extractFloat32Frac( a );
1604     aExp = extractFloat32Exp( a );
1605     aSign = extractFloat32Sign( a );
1606     shiftCount = aExp - 0x9E;
1607     if ( 0 <= shiftCount ) {
1608         if ( float32_val(a) != 0xCF000000 ) {
1609             float_raise(float_flag_invalid, status);
1610             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1611         }
1612         return (int32_t) 0x80000000;
1613     }
1614     else if ( aExp <= 0x7E ) {
1615         if (aExp | aSig) {
1616             status->float_exception_flags |= float_flag_inexact;
1617         }
1618         return 0;
1619     }
1620     aSig = ( aSig | 0x00800000 )<<8;
1621     z = aSig>>( - shiftCount );
1622     if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1623         status->float_exception_flags |= float_flag_inexact;
1624     }
1625     if ( aSign ) z = - z;
1626     return z;
1627 
1628 }
1629 
1630 /*----------------------------------------------------------------------------
1631 | Returns the result of converting the single-precision floating-point value
1632 | `a' to the 16-bit two's complement integer format.  The conversion is
1633 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1634 | Arithmetic, except that the conversion is always rounded toward zero.
1635 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1636 | the conversion overflows, the largest integer with the same sign as `a' is
1637 | returned.
1638 *----------------------------------------------------------------------------*/
1639 
1640 int16_t float32_to_int16_round_to_zero(float32 a, float_status *status)
1641 {
1642     flag aSign;
1643     int aExp;
1644     int shiftCount;
1645     uint32_t aSig;
1646     int32_t z;
1647 
1648     aSig = extractFloat32Frac( a );
1649     aExp = extractFloat32Exp( a );
1650     aSign = extractFloat32Sign( a );
1651     shiftCount = aExp - 0x8E;
1652     if ( 0 <= shiftCount ) {
1653         if ( float32_val(a) != 0xC7000000 ) {
1654             float_raise(float_flag_invalid, status);
1655             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1656                 return 0x7FFF;
1657             }
1658         }
1659         return (int32_t) 0xffff8000;
1660     }
1661     else if ( aExp <= 0x7E ) {
1662         if ( aExp | aSig ) {
1663             status->float_exception_flags |= float_flag_inexact;
1664         }
1665         return 0;
1666     }
1667     shiftCount -= 0x10;
1668     aSig = ( aSig | 0x00800000 )<<8;
1669     z = aSig>>( - shiftCount );
1670     if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1671         status->float_exception_flags |= float_flag_inexact;
1672     }
1673     if ( aSign ) {
1674         z = - z;
1675     }
1676     return z;
1677 
1678 }
1679 
1680 /*----------------------------------------------------------------------------
1681 | Returns the result of converting the single-precision floating-point value
1682 | `a' to the 64-bit two's complement integer format.  The conversion is
1683 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1684 | Arithmetic---which means in particular that the conversion is rounded
1685 | according to the current rounding mode.  If `a' is a NaN, the largest
1686 | positive integer is returned.  Otherwise, if the conversion overflows, the
1687 | largest integer with the same sign as `a' is returned.
1688 *----------------------------------------------------------------------------*/
1689 
1690 int64_t float32_to_int64(float32 a, float_status *status)
1691 {
1692     flag aSign;
1693     int aExp;
1694     int shiftCount;
1695     uint32_t aSig;
1696     uint64_t aSig64, aSigExtra;
1697     a = float32_squash_input_denormal(a, status);
1698 
1699     aSig = extractFloat32Frac( a );
1700     aExp = extractFloat32Exp( a );
1701     aSign = extractFloat32Sign( a );
1702     shiftCount = 0xBE - aExp;
1703     if ( shiftCount < 0 ) {
1704         float_raise(float_flag_invalid, status);
1705         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1706             return LIT64( 0x7FFFFFFFFFFFFFFF );
1707         }
1708         return (int64_t) LIT64( 0x8000000000000000 );
1709     }
1710     if ( aExp ) aSig |= 0x00800000;
1711     aSig64 = aSig;
1712     aSig64 <<= 40;
1713     shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1714     return roundAndPackInt64(aSign, aSig64, aSigExtra, status);
1715 
1716 }
1717 
1718 /*----------------------------------------------------------------------------
1719 | Returns the result of converting the single-precision floating-point value
1720 | `a' to the 64-bit unsigned integer format.  The conversion is
1721 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1722 | Arithmetic---which means in particular that the conversion is rounded
1723 | according to the current rounding mode.  If `a' is a NaN, the largest
1724 | unsigned integer is returned.  Otherwise, if the conversion overflows, the
1725 | largest unsigned integer is returned.  If the 'a' is negative, the result
1726 | is rounded and zero is returned; values that do not round to zero will
1727 | raise the inexact exception flag.
1728 *----------------------------------------------------------------------------*/
1729 
1730 uint64_t float32_to_uint64(float32 a, float_status *status)
1731 {
1732     flag aSign;
1733     int aExp;
1734     int shiftCount;
1735     uint32_t aSig;
1736     uint64_t aSig64, aSigExtra;
1737     a = float32_squash_input_denormal(a, status);
1738 
1739     aSig = extractFloat32Frac(a);
1740     aExp = extractFloat32Exp(a);
1741     aSign = extractFloat32Sign(a);
1742     if ((aSign) && (aExp > 126)) {
1743         float_raise(float_flag_invalid, status);
1744         if (float32_is_any_nan(a)) {
1745             return LIT64(0xFFFFFFFFFFFFFFFF);
1746         } else {
1747             return 0;
1748         }
1749     }
1750     shiftCount = 0xBE - aExp;
1751     if (aExp) {
1752         aSig |= 0x00800000;
1753     }
1754     if (shiftCount < 0) {
1755         float_raise(float_flag_invalid, status);
1756         return LIT64(0xFFFFFFFFFFFFFFFF);
1757     }
1758 
1759     aSig64 = aSig;
1760     aSig64 <<= 40;
1761     shift64ExtraRightJamming(aSig64, 0, shiftCount, &aSig64, &aSigExtra);
1762     return roundAndPackUint64(aSign, aSig64, aSigExtra, status);
1763 }
1764 
1765 /*----------------------------------------------------------------------------
1766 | Returns the result of converting the single-precision floating-point value
1767 | `a' to the 64-bit unsigned integer format.  The conversion is
1768 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1769 | Arithmetic, except that the conversion is always rounded toward zero.  If
1770 | `a' is a NaN, the largest unsigned integer is returned.  Otherwise, if the
1771 | conversion overflows, the largest unsigned integer is returned.  If the
1772 | 'a' is negative, the result is rounded and zero is returned; values that do
1773 | not round to zero will raise the inexact flag.
1774 *----------------------------------------------------------------------------*/
1775 
1776 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *status)
1777 {
1778     signed char current_rounding_mode = status->float_rounding_mode;
1779     set_float_rounding_mode(float_round_to_zero, status);
1780     int64_t v = float32_to_uint64(a, status);
1781     set_float_rounding_mode(current_rounding_mode, status);
1782     return v;
1783 }
1784 
1785 /*----------------------------------------------------------------------------
1786 | Returns the result of converting the single-precision floating-point value
1787 | `a' to the 64-bit two's complement integer format.  The conversion is
1788 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1789 | Arithmetic, except that the conversion is always rounded toward zero.  If
1790 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1791 | conversion overflows, the largest integer with the same sign as `a' is
1792 | returned.
1793 *----------------------------------------------------------------------------*/
1794 
1795 int64_t float32_to_int64_round_to_zero(float32 a, float_status *status)
1796 {
1797     flag aSign;
1798     int aExp;
1799     int shiftCount;
1800     uint32_t aSig;
1801     uint64_t aSig64;
1802     int64_t z;
1803     a = float32_squash_input_denormal(a, status);
1804 
1805     aSig = extractFloat32Frac( a );
1806     aExp = extractFloat32Exp( a );
1807     aSign = extractFloat32Sign( a );
1808     shiftCount = aExp - 0xBE;
1809     if ( 0 <= shiftCount ) {
1810         if ( float32_val(a) != 0xDF000000 ) {
1811             float_raise(float_flag_invalid, status);
1812             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1813                 return LIT64( 0x7FFFFFFFFFFFFFFF );
1814             }
1815         }
1816         return (int64_t) LIT64( 0x8000000000000000 );
1817     }
1818     else if ( aExp <= 0x7E ) {
1819         if (aExp | aSig) {
1820             status->float_exception_flags |= float_flag_inexact;
1821         }
1822         return 0;
1823     }
1824     aSig64 = aSig | 0x00800000;
1825     aSig64 <<= 40;
1826     z = aSig64>>( - shiftCount );
1827     if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
1828         status->float_exception_flags |= float_flag_inexact;
1829     }
1830     if ( aSign ) z = - z;
1831     return z;
1832 
1833 }
1834 
1835 /*----------------------------------------------------------------------------
1836 | Returns the result of converting the single-precision floating-point value
1837 | `a' to the double-precision floating-point format.  The conversion is
1838 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1839 | Arithmetic.
1840 *----------------------------------------------------------------------------*/
1841 
1842 float64 float32_to_float64(float32 a, float_status *status)
1843 {
1844     flag aSign;
1845     int aExp;
1846     uint32_t aSig;
1847     a = float32_squash_input_denormal(a, status);
1848 
1849     aSig = extractFloat32Frac( a );
1850     aExp = extractFloat32Exp( a );
1851     aSign = extractFloat32Sign( a );
1852     if ( aExp == 0xFF ) {
1853         if (aSig) {
1854             return commonNaNToFloat64(float32ToCommonNaN(a, status), status);
1855         }
1856         return packFloat64( aSign, 0x7FF, 0 );
1857     }
1858     if ( aExp == 0 ) {
1859         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1860         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1861         --aExp;
1862     }
1863     return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
1864 
1865 }
1866 
1867 /*----------------------------------------------------------------------------
1868 | Returns the result of converting the single-precision floating-point value
1869 | `a' to the extended double-precision floating-point format.  The conversion
1870 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1871 | Arithmetic.
1872 *----------------------------------------------------------------------------*/
1873 
1874 floatx80 float32_to_floatx80(float32 a, float_status *status)
1875 {
1876     flag aSign;
1877     int aExp;
1878     uint32_t aSig;
1879 
1880     a = float32_squash_input_denormal(a, status);
1881     aSig = extractFloat32Frac( a );
1882     aExp = extractFloat32Exp( a );
1883     aSign = extractFloat32Sign( a );
1884     if ( aExp == 0xFF ) {
1885         if (aSig) {
1886             return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
1887         }
1888         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1889     }
1890     if ( aExp == 0 ) {
1891         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1892         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1893     }
1894     aSig |= 0x00800000;
1895     return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
1896 
1897 }
1898 
1899 /*----------------------------------------------------------------------------
1900 | Returns the result of converting the single-precision floating-point value
1901 | `a' to the double-precision floating-point format.  The conversion is
1902 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1903 | Arithmetic.
1904 *----------------------------------------------------------------------------*/
1905 
1906 float128 float32_to_float128(float32 a, float_status *status)
1907 {
1908     flag aSign;
1909     int aExp;
1910     uint32_t aSig;
1911 
1912     a = float32_squash_input_denormal(a, status);
1913     aSig = extractFloat32Frac( a );
1914     aExp = extractFloat32Exp( a );
1915     aSign = extractFloat32Sign( a );
1916     if ( aExp == 0xFF ) {
1917         if (aSig) {
1918             return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
1919         }
1920         return packFloat128( aSign, 0x7FFF, 0, 0 );
1921     }
1922     if ( aExp == 0 ) {
1923         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1924         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1925         --aExp;
1926     }
1927     return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
1928 
1929 }
1930 
1931 /*----------------------------------------------------------------------------
1932 | Rounds the single-precision floating-point value `a' to an integer, and
1933 | returns the result as a single-precision floating-point value.  The
1934 | operation is performed according to the IEC/IEEE Standard for Binary
1935 | Floating-Point Arithmetic.
1936 *----------------------------------------------------------------------------*/
1937 
1938 float32 float32_round_to_int(float32 a, float_status *status)
1939 {
1940     flag aSign;
1941     int aExp;
1942     uint32_t lastBitMask, roundBitsMask;
1943     uint32_t z;
1944     a = float32_squash_input_denormal(a, status);
1945 
1946     aExp = extractFloat32Exp( a );
1947     if ( 0x96 <= aExp ) {
1948         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1949             return propagateFloat32NaN(a, a, status);
1950         }
1951         return a;
1952     }
1953     if ( aExp <= 0x7E ) {
1954         if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
1955         status->float_exception_flags |= float_flag_inexact;
1956         aSign = extractFloat32Sign( a );
1957         switch (status->float_rounding_mode) {
1958          case float_round_nearest_even:
1959             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1960                 return packFloat32( aSign, 0x7F, 0 );
1961             }
1962             break;
1963         case float_round_ties_away:
1964             if (aExp == 0x7E) {
1965                 return packFloat32(aSign, 0x7F, 0);
1966             }
1967             break;
1968          case float_round_down:
1969             return make_float32(aSign ? 0xBF800000 : 0);
1970          case float_round_up:
1971             return make_float32(aSign ? 0x80000000 : 0x3F800000);
1972         }
1973         return packFloat32( aSign, 0, 0 );
1974     }
1975     lastBitMask = 1;
1976     lastBitMask <<= 0x96 - aExp;
1977     roundBitsMask = lastBitMask - 1;
1978     z = float32_val(a);
1979     switch (status->float_rounding_mode) {
1980     case float_round_nearest_even:
1981         z += lastBitMask>>1;
1982         if ((z & roundBitsMask) == 0) {
1983             z &= ~lastBitMask;
1984         }
1985         break;
1986     case float_round_ties_away:
1987         z += lastBitMask >> 1;
1988         break;
1989     case float_round_to_zero:
1990         break;
1991     case float_round_up:
1992         if (!extractFloat32Sign(make_float32(z))) {
1993             z += roundBitsMask;
1994         }
1995         break;
1996     case float_round_down:
1997         if (extractFloat32Sign(make_float32(z))) {
1998             z += roundBitsMask;
1999         }
2000         break;
2001     default:
2002         abort();
2003     }
2004     z &= ~ roundBitsMask;
2005     if (z != float32_val(a)) {
2006         status->float_exception_flags |= float_flag_inexact;
2007     }
2008     return make_float32(z);
2009 
2010 }
2011 
2012 /*----------------------------------------------------------------------------
2013 | Returns the result of adding the absolute values of the single-precision
2014 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
2015 | before being returned.  `zSign' is ignored if the result is a NaN.
2016 | The addition is performed according to the IEC/IEEE Standard for Binary
2017 | Floating-Point Arithmetic.
2018 *----------------------------------------------------------------------------*/
2019 
2020 static float32 addFloat32Sigs(float32 a, float32 b, flag zSign,
2021                               float_status *status)
2022 {
2023     int aExp, bExp, zExp;
2024     uint32_t aSig, bSig, zSig;
2025     int expDiff;
2026 
2027     aSig = extractFloat32Frac( a );
2028     aExp = extractFloat32Exp( a );
2029     bSig = extractFloat32Frac( b );
2030     bExp = extractFloat32Exp( b );
2031     expDiff = aExp - bExp;
2032     aSig <<= 6;
2033     bSig <<= 6;
2034     if ( 0 < expDiff ) {
2035         if ( aExp == 0xFF ) {
2036             if (aSig) {
2037                 return propagateFloat32NaN(a, b, status);
2038             }
2039             return a;
2040         }
2041         if ( bExp == 0 ) {
2042             --expDiff;
2043         }
2044         else {
2045             bSig |= 0x20000000;
2046         }
2047         shift32RightJamming( bSig, expDiff, &bSig );
2048         zExp = aExp;
2049     }
2050     else if ( expDiff < 0 ) {
2051         if ( bExp == 0xFF ) {
2052             if (bSig) {
2053                 return propagateFloat32NaN(a, b, status);
2054             }
2055             return packFloat32( zSign, 0xFF, 0 );
2056         }
2057         if ( aExp == 0 ) {
2058             ++expDiff;
2059         }
2060         else {
2061             aSig |= 0x20000000;
2062         }
2063         shift32RightJamming( aSig, - expDiff, &aSig );
2064         zExp = bExp;
2065     }
2066     else {
2067         if ( aExp == 0xFF ) {
2068             if (aSig | bSig) {
2069                 return propagateFloat32NaN(a, b, status);
2070             }
2071             return a;
2072         }
2073         if ( aExp == 0 ) {
2074             if (status->flush_to_zero) {
2075                 if (aSig | bSig) {
2076                     float_raise(float_flag_output_denormal, status);
2077                 }
2078                 return packFloat32(zSign, 0, 0);
2079             }
2080             return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
2081         }
2082         zSig = 0x40000000 + aSig + bSig;
2083         zExp = aExp;
2084         goto roundAndPack;
2085     }
2086     aSig |= 0x20000000;
2087     zSig = ( aSig + bSig )<<1;
2088     --zExp;
2089     if ( (int32_t) zSig < 0 ) {
2090         zSig = aSig + bSig;
2091         ++zExp;
2092     }
2093  roundAndPack:
2094     return roundAndPackFloat32(zSign, zExp, zSig, status);
2095 
2096 }
2097 
2098 /*----------------------------------------------------------------------------
2099 | Returns the result of subtracting the absolute values of the single-
2100 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
2101 | difference is negated before being returned.  `zSign' is ignored if the
2102 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
2103 | Standard for Binary Floating-Point Arithmetic.
2104 *----------------------------------------------------------------------------*/
2105 
2106 static float32 subFloat32Sigs(float32 a, float32 b, flag zSign,
2107                               float_status *status)
2108 {
2109     int aExp, bExp, zExp;
2110     uint32_t aSig, bSig, zSig;
2111     int expDiff;
2112 
2113     aSig = extractFloat32Frac( a );
2114     aExp = extractFloat32Exp( a );
2115     bSig = extractFloat32Frac( b );
2116     bExp = extractFloat32Exp( b );
2117     expDiff = aExp - bExp;
2118     aSig <<= 7;
2119     bSig <<= 7;
2120     if ( 0 < expDiff ) goto aExpBigger;
2121     if ( expDiff < 0 ) goto bExpBigger;
2122     if ( aExp == 0xFF ) {
2123         if (aSig | bSig) {
2124             return propagateFloat32NaN(a, b, status);
2125         }
2126         float_raise(float_flag_invalid, status);
2127         return float32_default_nan(status);
2128     }
2129     if ( aExp == 0 ) {
2130         aExp = 1;
2131         bExp = 1;
2132     }
2133     if ( bSig < aSig ) goto aBigger;
2134     if ( aSig < bSig ) goto bBigger;
2135     return packFloat32(status->float_rounding_mode == float_round_down, 0, 0);
2136  bExpBigger:
2137     if ( bExp == 0xFF ) {
2138         if (bSig) {
2139             return propagateFloat32NaN(a, b, status);
2140         }
2141         return packFloat32( zSign ^ 1, 0xFF, 0 );
2142     }
2143     if ( aExp == 0 ) {
2144         ++expDiff;
2145     }
2146     else {
2147         aSig |= 0x40000000;
2148     }
2149     shift32RightJamming( aSig, - expDiff, &aSig );
2150     bSig |= 0x40000000;
2151  bBigger:
2152     zSig = bSig - aSig;
2153     zExp = bExp;
2154     zSign ^= 1;
2155     goto normalizeRoundAndPack;
2156  aExpBigger:
2157     if ( aExp == 0xFF ) {
2158         if (aSig) {
2159             return propagateFloat32NaN(a, b, status);
2160         }
2161         return a;
2162     }
2163     if ( bExp == 0 ) {
2164         --expDiff;
2165     }
2166     else {
2167         bSig |= 0x40000000;
2168     }
2169     shift32RightJamming( bSig, expDiff, &bSig );
2170     aSig |= 0x40000000;
2171  aBigger:
2172     zSig = aSig - bSig;
2173     zExp = aExp;
2174  normalizeRoundAndPack:
2175     --zExp;
2176     return normalizeRoundAndPackFloat32(zSign, zExp, zSig, status);
2177 
2178 }
2179 
2180 /*----------------------------------------------------------------------------
2181 | Returns the result of adding the single-precision floating-point values `a'
2182 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
2183 | Binary Floating-Point Arithmetic.
2184 *----------------------------------------------------------------------------*/
2185 
2186 float32 float32_add(float32 a, float32 b, float_status *status)
2187 {
2188     flag aSign, bSign;
2189     a = float32_squash_input_denormal(a, status);
2190     b = float32_squash_input_denormal(b, status);
2191 
2192     aSign = extractFloat32Sign( a );
2193     bSign = extractFloat32Sign( b );
2194     if ( aSign == bSign ) {
2195         return addFloat32Sigs(a, b, aSign, status);
2196     }
2197     else {
2198         return subFloat32Sigs(a, b, aSign, status);
2199     }
2200 
2201 }
2202 
2203 /*----------------------------------------------------------------------------
2204 | Returns the result of subtracting the single-precision floating-point values
2205 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2206 | for Binary Floating-Point Arithmetic.
2207 *----------------------------------------------------------------------------*/
2208 
2209 float32 float32_sub(float32 a, float32 b, float_status *status)
2210 {
2211     flag aSign, bSign;
2212     a = float32_squash_input_denormal(a, status);
2213     b = float32_squash_input_denormal(b, status);
2214 
2215     aSign = extractFloat32Sign( a );
2216     bSign = extractFloat32Sign( b );
2217     if ( aSign == bSign ) {
2218         return subFloat32Sigs(a, b, aSign, status);
2219     }
2220     else {
2221         return addFloat32Sigs(a, b, aSign, status);
2222     }
2223 
2224 }
2225 
2226 /*----------------------------------------------------------------------------
2227 | Returns the result of multiplying the single-precision floating-point values
2228 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2229 | for Binary Floating-Point Arithmetic.
2230 *----------------------------------------------------------------------------*/
2231 
2232 float32 float32_mul(float32 a, float32 b, float_status *status)
2233 {
2234     flag aSign, bSign, zSign;
2235     int aExp, bExp, zExp;
2236     uint32_t aSig, bSig;
2237     uint64_t zSig64;
2238     uint32_t zSig;
2239 
2240     a = float32_squash_input_denormal(a, status);
2241     b = float32_squash_input_denormal(b, status);
2242 
2243     aSig = extractFloat32Frac( a );
2244     aExp = extractFloat32Exp( a );
2245     aSign = extractFloat32Sign( a );
2246     bSig = extractFloat32Frac( b );
2247     bExp = extractFloat32Exp( b );
2248     bSign = extractFloat32Sign( b );
2249     zSign = aSign ^ bSign;
2250     if ( aExp == 0xFF ) {
2251         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2252             return propagateFloat32NaN(a, b, status);
2253         }
2254         if ( ( bExp | bSig ) == 0 ) {
2255             float_raise(float_flag_invalid, status);
2256             return float32_default_nan(status);
2257         }
2258         return packFloat32( zSign, 0xFF, 0 );
2259     }
2260     if ( bExp == 0xFF ) {
2261         if (bSig) {
2262             return propagateFloat32NaN(a, b, status);
2263         }
2264         if ( ( aExp | aSig ) == 0 ) {
2265             float_raise(float_flag_invalid, status);
2266             return float32_default_nan(status);
2267         }
2268         return packFloat32( zSign, 0xFF, 0 );
2269     }
2270     if ( aExp == 0 ) {
2271         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2272         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2273     }
2274     if ( bExp == 0 ) {
2275         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
2276         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2277     }
2278     zExp = aExp + bExp - 0x7F;
2279     aSig = ( aSig | 0x00800000 )<<7;
2280     bSig = ( bSig | 0x00800000 )<<8;
2281     shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );
2282     zSig = zSig64;
2283     if ( 0 <= (int32_t) ( zSig<<1 ) ) {
2284         zSig <<= 1;
2285         --zExp;
2286     }
2287     return roundAndPackFloat32(zSign, zExp, zSig, status);
2288 
2289 }
2290 
2291 /*----------------------------------------------------------------------------
2292 | Returns the result of dividing the single-precision floating-point value `a'
2293 | by the corresponding value `b'.  The operation is performed according to the
2294 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2295 *----------------------------------------------------------------------------*/
2296 
2297 float32 float32_div(float32 a, float32 b, float_status *status)
2298 {
2299     flag aSign, bSign, zSign;
2300     int aExp, bExp, zExp;
2301     uint32_t aSig, bSig, zSig;
2302     a = float32_squash_input_denormal(a, status);
2303     b = float32_squash_input_denormal(b, status);
2304 
2305     aSig = extractFloat32Frac( a );
2306     aExp = extractFloat32Exp( a );
2307     aSign = extractFloat32Sign( a );
2308     bSig = extractFloat32Frac( b );
2309     bExp = extractFloat32Exp( b );
2310     bSign = extractFloat32Sign( b );
2311     zSign = aSign ^ bSign;
2312     if ( aExp == 0xFF ) {
2313         if (aSig) {
2314             return propagateFloat32NaN(a, b, status);
2315         }
2316         if ( bExp == 0xFF ) {
2317             if (bSig) {
2318                 return propagateFloat32NaN(a, b, status);
2319             }
2320             float_raise(float_flag_invalid, status);
2321             return float32_default_nan(status);
2322         }
2323         return packFloat32( zSign, 0xFF, 0 );
2324     }
2325     if ( bExp == 0xFF ) {
2326         if (bSig) {
2327             return propagateFloat32NaN(a, b, status);
2328         }
2329         return packFloat32( zSign, 0, 0 );
2330     }
2331     if ( bExp == 0 ) {
2332         if ( bSig == 0 ) {
2333             if ( ( aExp | aSig ) == 0 ) {
2334                 float_raise(float_flag_invalid, status);
2335                 return float32_default_nan(status);
2336             }
2337             float_raise(float_flag_divbyzero, status);
2338             return packFloat32( zSign, 0xFF, 0 );
2339         }
2340         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2341     }
2342     if ( aExp == 0 ) {
2343         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2344         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2345     }
2346     zExp = aExp - bExp + 0x7D;
2347     aSig = ( aSig | 0x00800000 )<<7;
2348     bSig = ( bSig | 0x00800000 )<<8;
2349     if ( bSig <= ( aSig + aSig ) ) {
2350         aSig >>= 1;
2351         ++zExp;
2352     }
2353     zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;
2354     if ( ( zSig & 0x3F ) == 0 ) {
2355         zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );
2356     }
2357     return roundAndPackFloat32(zSign, zExp, zSig, status);
2358 
2359 }
2360 
2361 /*----------------------------------------------------------------------------
2362 | Returns the remainder of the single-precision floating-point value `a'
2363 | with respect to the corresponding value `b'.  The operation is performed
2364 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2365 *----------------------------------------------------------------------------*/
2366 
2367 float32 float32_rem(float32 a, float32 b, float_status *status)
2368 {
2369     flag aSign, zSign;
2370     int aExp, bExp, expDiff;
2371     uint32_t aSig, bSig;
2372     uint32_t q;
2373     uint64_t aSig64, bSig64, q64;
2374     uint32_t alternateASig;
2375     int32_t sigMean;
2376     a = float32_squash_input_denormal(a, status);
2377     b = float32_squash_input_denormal(b, status);
2378 
2379     aSig = extractFloat32Frac( a );
2380     aExp = extractFloat32Exp( a );
2381     aSign = extractFloat32Sign( a );
2382     bSig = extractFloat32Frac( b );
2383     bExp = extractFloat32Exp( b );
2384     if ( aExp == 0xFF ) {
2385         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2386             return propagateFloat32NaN(a, b, status);
2387         }
2388         float_raise(float_flag_invalid, status);
2389         return float32_default_nan(status);
2390     }
2391     if ( bExp == 0xFF ) {
2392         if (bSig) {
2393             return propagateFloat32NaN(a, b, status);
2394         }
2395         return a;
2396     }
2397     if ( bExp == 0 ) {
2398         if ( bSig == 0 ) {
2399             float_raise(float_flag_invalid, status);
2400             return float32_default_nan(status);
2401         }
2402         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2403     }
2404     if ( aExp == 0 ) {
2405         if ( aSig == 0 ) return a;
2406         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2407     }
2408     expDiff = aExp - bExp;
2409     aSig |= 0x00800000;
2410     bSig |= 0x00800000;
2411     if ( expDiff < 32 ) {
2412         aSig <<= 8;
2413         bSig <<= 8;
2414         if ( expDiff < 0 ) {
2415             if ( expDiff < -1 ) return a;
2416             aSig >>= 1;
2417         }
2418         q = ( bSig <= aSig );
2419         if ( q ) aSig -= bSig;
2420         if ( 0 < expDiff ) {
2421             q = ( ( (uint64_t) aSig )<<32 ) / bSig;
2422             q >>= 32 - expDiff;
2423             bSig >>= 2;
2424             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2425         }
2426         else {
2427             aSig >>= 2;
2428             bSig >>= 2;
2429         }
2430     }
2431     else {
2432         if ( bSig <= aSig ) aSig -= bSig;
2433         aSig64 = ( (uint64_t) aSig )<<40;
2434         bSig64 = ( (uint64_t) bSig )<<40;
2435         expDiff -= 64;
2436         while ( 0 < expDiff ) {
2437             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2438             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2439             aSig64 = - ( ( bSig * q64 )<<38 );
2440             expDiff -= 62;
2441         }
2442         expDiff += 64;
2443         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2444         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2445         q = q64>>( 64 - expDiff );
2446         bSig <<= 6;
2447         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2448     }
2449     do {
2450         alternateASig = aSig;
2451         ++q;
2452         aSig -= bSig;
2453     } while ( 0 <= (int32_t) aSig );
2454     sigMean = aSig + alternateASig;
2455     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2456         aSig = alternateASig;
2457     }
2458     zSign = ( (int32_t) aSig < 0 );
2459     if ( zSign ) aSig = - aSig;
2460     return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
2461 }
2462 
2463 /*----------------------------------------------------------------------------
2464 | Returns the result of multiplying the single-precision floating-point values
2465 | `a' and `b' then adding 'c', with no intermediate rounding step after the
2466 | multiplication.  The operation is performed according to the IEC/IEEE
2467 | Standard for Binary Floating-Point Arithmetic 754-2008.
2468 | The flags argument allows the caller to select negation of the
2469 | addend, the intermediate product, or the final result. (The difference
2470 | between this and having the caller do a separate negation is that negating
2471 | externally will flip the sign bit on NaNs.)
2472 *----------------------------------------------------------------------------*/
2473 
2474 float32 float32_muladd(float32 a, float32 b, float32 c, int flags,
2475                        float_status *status)
2476 {
2477     flag aSign, bSign, cSign, zSign;
2478     int aExp, bExp, cExp, pExp, zExp, expDiff;
2479     uint32_t aSig, bSig, cSig;
2480     flag pInf, pZero, pSign;
2481     uint64_t pSig64, cSig64, zSig64;
2482     uint32_t pSig;
2483     int shiftcount;
2484     flag signflip, infzero;
2485 
2486     a = float32_squash_input_denormal(a, status);
2487     b = float32_squash_input_denormal(b, status);
2488     c = float32_squash_input_denormal(c, status);
2489     aSig = extractFloat32Frac(a);
2490     aExp = extractFloat32Exp(a);
2491     aSign = extractFloat32Sign(a);
2492     bSig = extractFloat32Frac(b);
2493     bExp = extractFloat32Exp(b);
2494     bSign = extractFloat32Sign(b);
2495     cSig = extractFloat32Frac(c);
2496     cExp = extractFloat32Exp(c);
2497     cSign = extractFloat32Sign(c);
2498 
2499     infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) ||
2500                (aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0));
2501 
2502     /* It is implementation-defined whether the cases of (0,inf,qnan)
2503      * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
2504      * they return if they do), so we have to hand this information
2505      * off to the target-specific pick-a-NaN routine.
2506      */
2507     if (((aExp == 0xff) && aSig) ||
2508         ((bExp == 0xff) && bSig) ||
2509         ((cExp == 0xff) && cSig)) {
2510         return propagateFloat32MulAddNaN(a, b, c, infzero, status);
2511     }
2512 
2513     if (infzero) {
2514         float_raise(float_flag_invalid, status);
2515         return float32_default_nan(status);
2516     }
2517 
2518     if (flags & float_muladd_negate_c) {
2519         cSign ^= 1;
2520     }
2521 
2522     signflip = (flags & float_muladd_negate_result) ? 1 : 0;
2523 
2524     /* Work out the sign and type of the product */
2525     pSign = aSign ^ bSign;
2526     if (flags & float_muladd_negate_product) {
2527         pSign ^= 1;
2528     }
2529     pInf = (aExp == 0xff) || (bExp == 0xff);
2530     pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
2531 
2532     if (cExp == 0xff) {
2533         if (pInf && (pSign ^ cSign)) {
2534             /* addition of opposite-signed infinities => InvalidOperation */
2535             float_raise(float_flag_invalid, status);
2536             return float32_default_nan(status);
2537         }
2538         /* Otherwise generate an infinity of the same sign */
2539         return packFloat32(cSign ^ signflip, 0xff, 0);
2540     }
2541 
2542     if (pInf) {
2543         return packFloat32(pSign ^ signflip, 0xff, 0);
2544     }
2545 
2546     if (pZero) {
2547         if (cExp == 0) {
2548             if (cSig == 0) {
2549                 /* Adding two exact zeroes */
2550                 if (pSign == cSign) {
2551                     zSign = pSign;
2552                 } else if (status->float_rounding_mode == float_round_down) {
2553                     zSign = 1;
2554                 } else {
2555                     zSign = 0;
2556                 }
2557                 return packFloat32(zSign ^ signflip, 0, 0);
2558             }
2559             /* Exact zero plus a denorm */
2560             if (status->flush_to_zero) {
2561                 float_raise(float_flag_output_denormal, status);
2562                 return packFloat32(cSign ^ signflip, 0, 0);
2563             }
2564         }
2565         /* Zero plus something non-zero : just return the something */
2566         if (flags & float_muladd_halve_result) {
2567             if (cExp == 0) {
2568                 normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2569             }
2570             /* Subtract one to halve, and one again because roundAndPackFloat32
2571              * wants one less than the true exponent.
2572              */
2573             cExp -= 2;
2574             cSig = (cSig | 0x00800000) << 7;
2575             return roundAndPackFloat32(cSign ^ signflip, cExp, cSig, status);
2576         }
2577         return packFloat32(cSign ^ signflip, cExp, cSig);
2578     }
2579 
2580     if (aExp == 0) {
2581         normalizeFloat32Subnormal(aSig, &aExp, &aSig);
2582     }
2583     if (bExp == 0) {
2584         normalizeFloat32Subnormal(bSig, &bExp, &bSig);
2585     }
2586 
2587     /* Calculate the actual result a * b + c */
2588 
2589     /* Multiply first; this is easy. */
2590     /* NB: we subtract 0x7e where float32_mul() subtracts 0x7f
2591      * because we want the true exponent, not the "one-less-than"
2592      * flavour that roundAndPackFloat32() takes.
2593      */
2594     pExp = aExp + bExp - 0x7e;
2595     aSig = (aSig | 0x00800000) << 7;
2596     bSig = (bSig | 0x00800000) << 8;
2597     pSig64 = (uint64_t)aSig * bSig;
2598     if ((int64_t)(pSig64 << 1) >= 0) {
2599         pSig64 <<= 1;
2600         pExp--;
2601     }
2602 
2603     zSign = pSign ^ signflip;
2604 
2605     /* Now pSig64 is the significand of the multiply, with the explicit bit in
2606      * position 62.
2607      */
2608     if (cExp == 0) {
2609         if (!cSig) {
2610             /* Throw out the special case of c being an exact zero now */
2611             shift64RightJamming(pSig64, 32, &pSig64);
2612             pSig = pSig64;
2613             if (flags & float_muladd_halve_result) {
2614                 pExp--;
2615             }
2616             return roundAndPackFloat32(zSign, pExp - 1,
2617                                        pSig, status);
2618         }
2619         normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2620     }
2621 
2622     cSig64 = (uint64_t)cSig << (62 - 23);
2623     cSig64 |= LIT64(0x4000000000000000);
2624     expDiff = pExp - cExp;
2625 
2626     if (pSign == cSign) {
2627         /* Addition */
2628         if (expDiff > 0) {
2629             /* scale c to match p */
2630             shift64RightJamming(cSig64, expDiff, &cSig64);
2631             zExp = pExp;
2632         } else if (expDiff < 0) {
2633             /* scale p to match c */
2634             shift64RightJamming(pSig64, -expDiff, &pSig64);
2635             zExp = cExp;
2636         } else {
2637             /* no scaling needed */
2638             zExp = cExp;
2639         }
2640         /* Add significands and make sure explicit bit ends up in posn 62 */
2641         zSig64 = pSig64 + cSig64;
2642         if ((int64_t)zSig64 < 0) {
2643             shift64RightJamming(zSig64, 1, &zSig64);
2644         } else {
2645             zExp--;
2646         }
2647     } else {
2648         /* Subtraction */
2649         if (expDiff > 0) {
2650             shift64RightJamming(cSig64, expDiff, &cSig64);
2651             zSig64 = pSig64 - cSig64;
2652             zExp = pExp;
2653         } else if (expDiff < 0) {
2654             shift64RightJamming(pSig64, -expDiff, &pSig64);
2655             zSig64 = cSig64 - pSig64;
2656             zExp = cExp;
2657             zSign ^= 1;
2658         } else {
2659             zExp = pExp;
2660             if (cSig64 < pSig64) {
2661                 zSig64 = pSig64 - cSig64;
2662             } else if (pSig64 < cSig64) {
2663                 zSig64 = cSig64 - pSig64;
2664                 zSign ^= 1;
2665             } else {
2666                 /* Exact zero */
2667                 zSign = signflip;
2668                 if (status->float_rounding_mode == float_round_down) {
2669                     zSign ^= 1;
2670                 }
2671                 return packFloat32(zSign, 0, 0);
2672             }
2673         }
2674         --zExp;
2675         /* Normalize to put the explicit bit back into bit 62. */
2676         shiftcount = countLeadingZeros64(zSig64) - 1;
2677         zSig64 <<= shiftcount;
2678         zExp -= shiftcount;
2679     }
2680     if (flags & float_muladd_halve_result) {
2681         zExp--;
2682     }
2683 
2684     shift64RightJamming(zSig64, 32, &zSig64);
2685     return roundAndPackFloat32(zSign, zExp, zSig64, status);
2686 }
2687 
2688 
2689 /*----------------------------------------------------------------------------
2690 | Returns the square root of the single-precision floating-point value `a'.
2691 | The operation is performed according to the IEC/IEEE Standard for Binary
2692 | Floating-Point Arithmetic.
2693 *----------------------------------------------------------------------------*/
2694 
2695 float32 float32_sqrt(float32 a, float_status *status)
2696 {
2697     flag aSign;
2698     int aExp, zExp;
2699     uint32_t aSig, zSig;
2700     uint64_t rem, term;
2701     a = float32_squash_input_denormal(a, status);
2702 
2703     aSig = extractFloat32Frac( a );
2704     aExp = extractFloat32Exp( a );
2705     aSign = extractFloat32Sign( a );
2706     if ( aExp == 0xFF ) {
2707         if (aSig) {
2708             return propagateFloat32NaN(a, float32_zero, status);
2709         }
2710         if ( ! aSign ) return a;
2711         float_raise(float_flag_invalid, status);
2712         return float32_default_nan(status);
2713     }
2714     if ( aSign ) {
2715         if ( ( aExp | aSig ) == 0 ) return a;
2716         float_raise(float_flag_invalid, status);
2717         return float32_default_nan(status);
2718     }
2719     if ( aExp == 0 ) {
2720         if ( aSig == 0 ) return float32_zero;
2721         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2722     }
2723     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2724     aSig = ( aSig | 0x00800000 )<<8;
2725     zSig = estimateSqrt32( aExp, aSig ) + 2;
2726     if ( ( zSig & 0x7F ) <= 5 ) {
2727         if ( zSig < 2 ) {
2728             zSig = 0x7FFFFFFF;
2729             goto roundAndPack;
2730         }
2731         aSig >>= aExp & 1;
2732         term = ( (uint64_t) zSig ) * zSig;
2733         rem = ( ( (uint64_t) aSig )<<32 ) - term;
2734         while ( (int64_t) rem < 0 ) {
2735             --zSig;
2736             rem += ( ( (uint64_t) zSig )<<1 ) | 1;
2737         }
2738         zSig |= ( rem != 0 );
2739     }
2740     shift32RightJamming( zSig, 1, &zSig );
2741  roundAndPack:
2742     return roundAndPackFloat32(0, zExp, zSig, status);
2743 
2744 }
2745 
2746 /*----------------------------------------------------------------------------
2747 | Returns the binary exponential of the single-precision floating-point value
2748 | `a'. The operation is performed according to the IEC/IEEE Standard for
2749 | Binary Floating-Point Arithmetic.
2750 |
2751 | Uses the following identities:
2752 |
2753 | 1. -------------------------------------------------------------------------
2754 |      x    x*ln(2)
2755 |     2  = e
2756 |
2757 | 2. -------------------------------------------------------------------------
2758 |                      2     3     4     5           n
2759 |      x        x     x     x     x     x           x
2760 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2761 |               1!    2!    3!    4!    5!          n!
2762 *----------------------------------------------------------------------------*/
2763 
2764 static const float64 float32_exp2_coefficients[15] =
2765 {
2766     const_float64( 0x3ff0000000000000ll ), /*  1 */
2767     const_float64( 0x3fe0000000000000ll ), /*  2 */
2768     const_float64( 0x3fc5555555555555ll ), /*  3 */
2769     const_float64( 0x3fa5555555555555ll ), /*  4 */
2770     const_float64( 0x3f81111111111111ll ), /*  5 */
2771     const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
2772     const_float64( 0x3f2a01a01a01a01all ), /*  7 */
2773     const_float64( 0x3efa01a01a01a01all ), /*  8 */
2774     const_float64( 0x3ec71de3a556c734ll ), /*  9 */
2775     const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2776     const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2777     const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2778     const_float64( 0x3de6124613a86d09ll ), /* 13 */
2779     const_float64( 0x3da93974a8c07c9dll ), /* 14 */
2780     const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
2781 };
2782 
2783 float32 float32_exp2(float32 a, float_status *status)
2784 {
2785     flag aSign;
2786     int aExp;
2787     uint32_t aSig;
2788     float64 r, x, xn;
2789     int i;
2790     a = float32_squash_input_denormal(a, status);
2791 
2792     aSig = extractFloat32Frac( a );
2793     aExp = extractFloat32Exp( a );
2794     aSign = extractFloat32Sign( a );
2795 
2796     if ( aExp == 0xFF) {
2797         if (aSig) {
2798             return propagateFloat32NaN(a, float32_zero, status);
2799         }
2800         return (aSign) ? float32_zero : a;
2801     }
2802     if (aExp == 0) {
2803         if (aSig == 0) return float32_one;
2804     }
2805 
2806     float_raise(float_flag_inexact, status);
2807 
2808     /* ******************************* */
2809     /* using float64 for approximation */
2810     /* ******************************* */
2811     x = float32_to_float64(a, status);
2812     x = float64_mul(x, float64_ln2, status);
2813 
2814     xn = x;
2815     r = float64_one;
2816     for (i = 0 ; i < 15 ; i++) {
2817         float64 f;
2818 
2819         f = float64_mul(xn, float32_exp2_coefficients[i], status);
2820         r = float64_add(r, f, status);
2821 
2822         xn = float64_mul(xn, x, status);
2823     }
2824 
2825     return float64_to_float32(r, status);
2826 }
2827 
2828 /*----------------------------------------------------------------------------
2829 | Returns the binary log of the single-precision floating-point value `a'.
2830 | The operation is performed according to the IEC/IEEE Standard for Binary
2831 | Floating-Point Arithmetic.
2832 *----------------------------------------------------------------------------*/
2833 float32 float32_log2(float32 a, float_status *status)
2834 {
2835     flag aSign, zSign;
2836     int aExp;
2837     uint32_t aSig, zSig, i;
2838 
2839     a = float32_squash_input_denormal(a, status);
2840     aSig = extractFloat32Frac( a );
2841     aExp = extractFloat32Exp( a );
2842     aSign = extractFloat32Sign( a );
2843 
2844     if ( aExp == 0 ) {
2845         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2846         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2847     }
2848     if ( aSign ) {
2849         float_raise(float_flag_invalid, status);
2850         return float32_default_nan(status);
2851     }
2852     if ( aExp == 0xFF ) {
2853         if (aSig) {
2854             return propagateFloat32NaN(a, float32_zero, status);
2855         }
2856         return a;
2857     }
2858 
2859     aExp -= 0x7F;
2860     aSig |= 0x00800000;
2861     zSign = aExp < 0;
2862     zSig = aExp << 23;
2863 
2864     for (i = 1 << 22; i > 0; i >>= 1) {
2865         aSig = ( (uint64_t)aSig * aSig ) >> 23;
2866         if ( aSig & 0x01000000 ) {
2867             aSig >>= 1;
2868             zSig |= i;
2869         }
2870     }
2871 
2872     if ( zSign )
2873         zSig = -zSig;
2874 
2875     return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
2876 }
2877 
2878 /*----------------------------------------------------------------------------
2879 | Returns 1 if the single-precision floating-point value `a' is equal to
2880 | the corresponding value `b', and 0 otherwise.  The invalid exception is
2881 | raised if either operand is a NaN.  Otherwise, the comparison is performed
2882 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2883 *----------------------------------------------------------------------------*/
2884 
2885 int float32_eq(float32 a, float32 b, float_status *status)
2886 {
2887     uint32_t av, bv;
2888     a = float32_squash_input_denormal(a, status);
2889     b = float32_squash_input_denormal(b, status);
2890 
2891     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2892          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2893        ) {
2894         float_raise(float_flag_invalid, status);
2895         return 0;
2896     }
2897     av = float32_val(a);
2898     bv = float32_val(b);
2899     return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2900 }
2901 
2902 /*----------------------------------------------------------------------------
2903 | Returns 1 if the single-precision floating-point value `a' is less than
2904 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
2905 | exception is raised if either operand is a NaN.  The comparison is performed
2906 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2907 *----------------------------------------------------------------------------*/
2908 
2909 int float32_le(float32 a, float32 b, float_status *status)
2910 {
2911     flag aSign, bSign;
2912     uint32_t av, bv;
2913     a = float32_squash_input_denormal(a, status);
2914     b = float32_squash_input_denormal(b, status);
2915 
2916     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2917          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2918        ) {
2919         float_raise(float_flag_invalid, status);
2920         return 0;
2921     }
2922     aSign = extractFloat32Sign( a );
2923     bSign = extractFloat32Sign( b );
2924     av = float32_val(a);
2925     bv = float32_val(b);
2926     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2927     return ( av == bv ) || ( aSign ^ ( av < bv ) );
2928 
2929 }
2930 
2931 /*----------------------------------------------------------------------------
2932 | Returns 1 if the single-precision floating-point value `a' is less than
2933 | the corresponding value `b', and 0 otherwise.  The invalid exception is
2934 | raised if either operand is a NaN.  The comparison is performed according
2935 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2936 *----------------------------------------------------------------------------*/
2937 
2938 int float32_lt(float32 a, float32 b, float_status *status)
2939 {
2940     flag aSign, bSign;
2941     uint32_t av, bv;
2942     a = float32_squash_input_denormal(a, status);
2943     b = float32_squash_input_denormal(b, status);
2944 
2945     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2946          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2947        ) {
2948         float_raise(float_flag_invalid, status);
2949         return 0;
2950     }
2951     aSign = extractFloat32Sign( a );
2952     bSign = extractFloat32Sign( b );
2953     av = float32_val(a);
2954     bv = float32_val(b);
2955     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2956     return ( av != bv ) && ( aSign ^ ( av < bv ) );
2957 
2958 }
2959 
2960 /*----------------------------------------------------------------------------
2961 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2962 | be compared, and 0 otherwise.  The invalid exception is raised if either
2963 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
2964 | Standard for Binary Floating-Point Arithmetic.
2965 *----------------------------------------------------------------------------*/
2966 
2967 int float32_unordered(float32 a, float32 b, float_status *status)
2968 {
2969     a = float32_squash_input_denormal(a, status);
2970     b = float32_squash_input_denormal(b, status);
2971 
2972     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2973          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2974        ) {
2975         float_raise(float_flag_invalid, status);
2976         return 1;
2977     }
2978     return 0;
2979 }
2980 
2981 /*----------------------------------------------------------------------------
2982 | Returns 1 if the single-precision floating-point value `a' is equal to
2983 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2984 | exception.  The comparison is performed according to the IEC/IEEE Standard
2985 | for Binary Floating-Point Arithmetic.
2986 *----------------------------------------------------------------------------*/
2987 
2988 int float32_eq_quiet(float32 a, float32 b, float_status *status)
2989 {
2990     a = float32_squash_input_denormal(a, status);
2991     b = float32_squash_input_denormal(b, status);
2992 
2993     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2994          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2995        ) {
2996         if (float32_is_signaling_nan(a, status)
2997          || float32_is_signaling_nan(b, status)) {
2998             float_raise(float_flag_invalid, status);
2999         }
3000         return 0;
3001     }
3002     return ( float32_val(a) == float32_val(b) ) ||
3003             ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
3004 }
3005 
3006 /*----------------------------------------------------------------------------
3007 | Returns 1 if the single-precision floating-point value `a' is less than or
3008 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3009 | cause an exception.  Otherwise, the comparison is performed according to the
3010 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3011 *----------------------------------------------------------------------------*/
3012 
3013 int float32_le_quiet(float32 a, float32 b, float_status *status)
3014 {
3015     flag aSign, bSign;
3016     uint32_t av, bv;
3017     a = float32_squash_input_denormal(a, status);
3018     b = float32_squash_input_denormal(b, status);
3019 
3020     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3021          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3022        ) {
3023         if (float32_is_signaling_nan(a, status)
3024          || float32_is_signaling_nan(b, status)) {
3025             float_raise(float_flag_invalid, status);
3026         }
3027         return 0;
3028     }
3029     aSign = extractFloat32Sign( a );
3030     bSign = extractFloat32Sign( b );
3031     av = float32_val(a);
3032     bv = float32_val(b);
3033     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3034     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3035 
3036 }
3037 
3038 /*----------------------------------------------------------------------------
3039 | Returns 1 if the single-precision floating-point value `a' is less than
3040 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3041 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3042 | Standard for Binary Floating-Point Arithmetic.
3043 *----------------------------------------------------------------------------*/
3044 
3045 int float32_lt_quiet(float32 a, float32 b, float_status *status)
3046 {
3047     flag aSign, bSign;
3048     uint32_t av, bv;
3049     a = float32_squash_input_denormal(a, status);
3050     b = float32_squash_input_denormal(b, status);
3051 
3052     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3053          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3054        ) {
3055         if (float32_is_signaling_nan(a, status)
3056          || float32_is_signaling_nan(b, status)) {
3057             float_raise(float_flag_invalid, status);
3058         }
3059         return 0;
3060     }
3061     aSign = extractFloat32Sign( a );
3062     bSign = extractFloat32Sign( b );
3063     av = float32_val(a);
3064     bv = float32_val(b);
3065     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3066     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3067 
3068 }
3069 
3070 /*----------------------------------------------------------------------------
3071 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3072 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
3073 | comparison is performed according to the IEC/IEEE Standard for Binary
3074 | Floating-Point Arithmetic.
3075 *----------------------------------------------------------------------------*/
3076 
3077 int float32_unordered_quiet(float32 a, float32 b, float_status *status)
3078 {
3079     a = float32_squash_input_denormal(a, status);
3080     b = float32_squash_input_denormal(b, status);
3081 
3082     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3083          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3084        ) {
3085         if (float32_is_signaling_nan(a, status)
3086          || float32_is_signaling_nan(b, status)) {
3087             float_raise(float_flag_invalid, status);
3088         }
3089         return 1;
3090     }
3091     return 0;
3092 }
3093 
3094 /*----------------------------------------------------------------------------
3095 | Returns the result of converting the double-precision floating-point value
3096 | `a' to the 32-bit two's complement integer format.  The conversion is
3097 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3098 | Arithmetic---which means in particular that the conversion is rounded
3099 | according to the current rounding mode.  If `a' is a NaN, the largest
3100 | positive integer is returned.  Otherwise, if the conversion overflows, the
3101 | largest integer with the same sign as `a' is returned.
3102 *----------------------------------------------------------------------------*/
3103 
3104 int32_t float64_to_int32(float64 a, float_status *status)
3105 {
3106     flag aSign;
3107     int aExp;
3108     int shiftCount;
3109     uint64_t aSig;
3110     a = float64_squash_input_denormal(a, status);
3111 
3112     aSig = extractFloat64Frac( a );
3113     aExp = extractFloat64Exp( a );
3114     aSign = extractFloat64Sign( a );
3115     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
3116     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3117     shiftCount = 0x42C - aExp;
3118     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
3119     return roundAndPackInt32(aSign, aSig, status);
3120 
3121 }
3122 
3123 /*----------------------------------------------------------------------------
3124 | Returns the result of converting the double-precision floating-point value
3125 | `a' to the 32-bit two's complement integer format.  The conversion is
3126 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3127 | Arithmetic, except that the conversion is always rounded toward zero.
3128 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
3129 | the conversion overflows, the largest integer with the same sign as `a' is
3130 | returned.
3131 *----------------------------------------------------------------------------*/
3132 
3133 int32_t float64_to_int32_round_to_zero(float64 a, float_status *status)
3134 {
3135     flag aSign;
3136     int aExp;
3137     int shiftCount;
3138     uint64_t aSig, savedASig;
3139     int32_t z;
3140     a = float64_squash_input_denormal(a, status);
3141 
3142     aSig = extractFloat64Frac( a );
3143     aExp = extractFloat64Exp( a );
3144     aSign = extractFloat64Sign( a );
3145     if ( 0x41E < aExp ) {
3146         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
3147         goto invalid;
3148     }
3149     else if ( aExp < 0x3FF ) {
3150         if (aExp || aSig) {
3151             status->float_exception_flags |= float_flag_inexact;
3152         }
3153         return 0;
3154     }
3155     aSig |= LIT64( 0x0010000000000000 );
3156     shiftCount = 0x433 - aExp;
3157     savedASig = aSig;
3158     aSig >>= shiftCount;
3159     z = aSig;
3160     if ( aSign ) z = - z;
3161     if ( ( z < 0 ) ^ aSign ) {
3162  invalid:
3163         float_raise(float_flag_invalid, status);
3164         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
3165     }
3166     if ( ( aSig<<shiftCount ) != savedASig ) {
3167         status->float_exception_flags |= float_flag_inexact;
3168     }
3169     return z;
3170 
3171 }
3172 
3173 /*----------------------------------------------------------------------------
3174 | Returns the result of converting the double-precision floating-point value
3175 | `a' to the 16-bit two's complement integer format.  The conversion is
3176 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3177 | Arithmetic, except that the conversion is always rounded toward zero.
3178 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
3179 | the conversion overflows, the largest integer with the same sign as `a' is
3180 | returned.
3181 *----------------------------------------------------------------------------*/
3182 
3183 int16_t float64_to_int16_round_to_zero(float64 a, float_status *status)
3184 {
3185     flag aSign;
3186     int aExp;
3187     int shiftCount;
3188     uint64_t aSig, savedASig;
3189     int32_t z;
3190 
3191     aSig = extractFloat64Frac( a );
3192     aExp = extractFloat64Exp( a );
3193     aSign = extractFloat64Sign( a );
3194     if ( 0x40E < aExp ) {
3195         if ( ( aExp == 0x7FF ) && aSig ) {
3196             aSign = 0;
3197         }
3198         goto invalid;
3199     }
3200     else if ( aExp < 0x3FF ) {
3201         if ( aExp || aSig ) {
3202             status->float_exception_flags |= float_flag_inexact;
3203         }
3204         return 0;
3205     }
3206     aSig |= LIT64( 0x0010000000000000 );
3207     shiftCount = 0x433 - aExp;
3208     savedASig = aSig;
3209     aSig >>= shiftCount;
3210     z = aSig;
3211     if ( aSign ) {
3212         z = - z;
3213     }
3214     if ( ( (int16_t)z < 0 ) ^ aSign ) {
3215  invalid:
3216         float_raise(float_flag_invalid, status);
3217         return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
3218     }
3219     if ( ( aSig<<shiftCount ) != savedASig ) {
3220         status->float_exception_flags |= float_flag_inexact;
3221     }
3222     return z;
3223 }
3224 
3225 /*----------------------------------------------------------------------------
3226 | Returns the result of converting the double-precision floating-point value
3227 | `a' to the 64-bit two's complement integer format.  The conversion is
3228 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3229 | Arithmetic---which means in particular that the conversion is rounded
3230 | according to the current rounding mode.  If `a' is a NaN, the largest
3231 | positive integer is returned.  Otherwise, if the conversion overflows, the
3232 | largest integer with the same sign as `a' is returned.
3233 *----------------------------------------------------------------------------*/
3234 
3235 int64_t float64_to_int64(float64 a, float_status *status)
3236 {
3237     flag aSign;
3238     int aExp;
3239     int shiftCount;
3240     uint64_t aSig, aSigExtra;
3241     a = float64_squash_input_denormal(a, status);
3242 
3243     aSig = extractFloat64Frac( a );
3244     aExp = extractFloat64Exp( a );
3245     aSign = extractFloat64Sign( a );
3246     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3247     shiftCount = 0x433 - aExp;
3248     if ( shiftCount <= 0 ) {
3249         if ( 0x43E < aExp ) {
3250             float_raise(float_flag_invalid, status);
3251             if (    ! aSign
3252                  || (    ( aExp == 0x7FF )
3253                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
3254                ) {
3255                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3256             }
3257             return (int64_t) LIT64( 0x8000000000000000 );
3258         }
3259         aSigExtra = 0;
3260         aSig <<= - shiftCount;
3261     }
3262     else {
3263         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3264     }
3265     return roundAndPackInt64(aSign, aSig, aSigExtra, status);
3266 
3267 }
3268 
3269 /*----------------------------------------------------------------------------
3270 | Returns the result of converting the double-precision floating-point value
3271 | `a' to the 64-bit two's complement integer format.  The conversion is
3272 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3273 | Arithmetic, except that the conversion is always rounded toward zero.
3274 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
3275 | the conversion overflows, the largest integer with the same sign as `a' is
3276 | returned.
3277 *----------------------------------------------------------------------------*/
3278 
3279 int64_t float64_to_int64_round_to_zero(float64 a, float_status *status)
3280 {
3281     flag aSign;
3282     int aExp;
3283     int shiftCount;
3284     uint64_t aSig;
3285     int64_t z;
3286     a = float64_squash_input_denormal(a, status);
3287 
3288     aSig = extractFloat64Frac( a );
3289     aExp = extractFloat64Exp( a );
3290     aSign = extractFloat64Sign( a );
3291     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3292     shiftCount = aExp - 0x433;
3293     if ( 0 <= shiftCount ) {
3294         if ( 0x43E <= aExp ) {
3295             if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
3296                 float_raise(float_flag_invalid, status);
3297                 if (    ! aSign
3298                      || (    ( aExp == 0x7FF )
3299                           && ( aSig != LIT64( 0x0010000000000000 ) ) )
3300                    ) {
3301                     return LIT64( 0x7FFFFFFFFFFFFFFF );
3302                 }
3303             }
3304             return (int64_t) LIT64( 0x8000000000000000 );
3305         }
3306         z = aSig<<shiftCount;
3307     }
3308     else {
3309         if ( aExp < 0x3FE ) {
3310             if (aExp | aSig) {
3311                 status->float_exception_flags |= float_flag_inexact;
3312             }
3313             return 0;
3314         }
3315         z = aSig>>( - shiftCount );
3316         if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
3317             status->float_exception_flags |= float_flag_inexact;
3318         }
3319     }
3320     if ( aSign ) z = - z;
3321     return z;
3322 
3323 }
3324 
3325 /*----------------------------------------------------------------------------
3326 | Returns the result of converting the double-precision floating-point value
3327 | `a' to the single-precision floating-point format.  The conversion is
3328 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3329 | Arithmetic.
3330 *----------------------------------------------------------------------------*/
3331 
3332 float32 float64_to_float32(float64 a, float_status *status)
3333 {
3334     flag aSign;
3335     int aExp;
3336     uint64_t aSig;
3337     uint32_t zSig;
3338     a = float64_squash_input_denormal(a, status);
3339 
3340     aSig = extractFloat64Frac( a );
3341     aExp = extractFloat64Exp( a );
3342     aSign = extractFloat64Sign( a );
3343     if ( aExp == 0x7FF ) {
3344         if (aSig) {
3345             return commonNaNToFloat32(float64ToCommonNaN(a, status), status);
3346         }
3347         return packFloat32( aSign, 0xFF, 0 );
3348     }
3349     shift64RightJamming( aSig, 22, &aSig );
3350     zSig = aSig;
3351     if ( aExp || zSig ) {
3352         zSig |= 0x40000000;
3353         aExp -= 0x381;
3354     }
3355     return roundAndPackFloat32(aSign, aExp, zSig, status);
3356 
3357 }
3358 
3359 
3360 /*----------------------------------------------------------------------------
3361 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3362 | half-precision floating-point value, returning the result.  After being
3363 | shifted into the proper positions, the three fields are simply added
3364 | together to form the result.  This means that any integer portion of `zSig'
3365 | will be added into the exponent.  Since a properly normalized significand
3366 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3367 | than the desired result exponent whenever `zSig' is a complete, normalized
3368 | significand.
3369 *----------------------------------------------------------------------------*/
3370 static float16 packFloat16(flag zSign, int zExp, uint16_t zSig)
3371 {
3372     return make_float16(
3373         (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
3374 }
3375 
3376 /*----------------------------------------------------------------------------
3377 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3378 | and significand `zSig', and returns the proper half-precision floating-
3379 | point value corresponding to the abstract input.  Ordinarily, the abstract
3380 | value is simply rounded and packed into the half-precision format, with
3381 | the inexact exception raised if the abstract input cannot be represented
3382 | exactly.  However, if the abstract value is too large, the overflow and
3383 | inexact exceptions are raised and an infinity or maximal finite value is
3384 | returned.  If the abstract value is too small, the input value is rounded to
3385 | a subnormal number, and the underflow and inexact exceptions are raised if
3386 | the abstract input cannot be represented exactly as a subnormal half-
3387 | precision floating-point number.
3388 | The `ieee' flag indicates whether to use IEEE standard half precision, or
3389 | ARM-style "alternative representation", which omits the NaN and Inf
3390 | encodings in order to raise the maximum representable exponent by one.
3391 |     The input significand `zSig' has its binary point between bits 22
3392 | and 23, which is 13 bits to the left of the usual location.  This shifted
3393 | significand must be normalized or smaller.  If `zSig' is not normalized,
3394 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3395 | and it must not require rounding.  In the usual case that `zSig' is
3396 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3397 | Note the slightly odd position of the binary point in zSig compared with the
3398 | other roundAndPackFloat functions. This should probably be fixed if we
3399 | need to implement more float16 routines than just conversion.
3400 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3401 | Binary Floating-Point Arithmetic.
3402 *----------------------------------------------------------------------------*/
3403 
static float16 roundAndPackFloat16(flag zSign, int zExp,
                                   uint32_t zSig, flag ieee,
                                   float_status *status)
{
    /* Largest zExp that can still be packed; the non-IEEE format allows
     * one more than the IEEE format.
     */
    int maxexp = ieee ? 29 : 30;
    uint32_t mask;
    uint32_t increment;
    bool rounding_bumps_exp;
    bool is_tiny = false;

    /* Calculate the mask of bits of the mantissa which are not
     * representable in half-precision and will be lost.
     */
    if (zExp < 1) {
        /* Will be denormal in halfprec */
        mask = 0x00ffffff;
        if (zExp >= -11) {
            mask >>= 11 + zExp;
        }
    } else {
        /* Normal number in halfprec */
        mask = 0x00001fff;
    }

    /* Choose the value added to zSig before the masked bits are dropped,
     * according to the current rounding mode.
     */
    switch (status->float_rounding_mode) {
    case float_round_nearest_even:
        /* Half of the lowest kept bit ... */
        increment = (mask + 1) >> 1;
        if ((zSig & mask) == increment) {
            /* ... except on an exact tie, where one unit of the lowest kept
             * bit is added only if that bit is currently set.
             */
            increment = zSig & (increment << 1);
        }
        break;
    case float_round_ties_away:
        increment = (mask + 1) >> 1;
        break;
    case float_round_up:
        /* Only positive values move away from zero */
        increment = zSign ? 0 : mask;
        break;
    case float_round_down:
        /* Only negative values move away from zero */
        increment = zSign ? mask : 0;
        break;
    default: /* round_to_zero */
        increment = 0;
        break;
    }

    /* True when adding the increment carries out of bit 23 of zSig */
    rounding_bumps_exp = (zSig + increment >= 0x01000000);

    if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) {
        if (ieee) {
            /* IEEE format: overflow to infinity */
            float_raise(float_flag_overflow | float_flag_inexact, status);
            return packFloat16(zSign, 0x1f, 0);
        } else {
            /* Non-IEEE format: raise invalid and return the all-ones
             * exponent/fraction encoding.
             */
            float_raise(float_flag_invalid, status);
            return packFloat16(zSign, 0x1f, 0x3ff);
        }
    }

    if (zExp < 0) {
        /* Note that flush-to-zero does not affect half-precision results */
        is_tiny =
            (status->float_detect_tininess == float_tininess_before_rounding)
            || (zExp < -1)
            || (!rounding_bumps_exp);
    }
    /* Inexact whenever discarded bits are non-zero; underflow additionally
     * requires the value to have been classified as tiny above.
     */
    if (zSig & mask) {
        float_raise(float_flag_inexact, status);
        if (is_tiny) {
            float_raise(float_flag_underflow, status);
        }
    }

    /* Apply the rounding; on a carry, renormalize by one bit */
    zSig += increment;
    if (rounding_bumps_exp) {
        zSig >>= 1;
        zExp++;
    }

    /* Exponents below -10 always produce a signed zero */
    if (zExp < -10) {
        return packFloat16(zSign, 0, 0);
    }
    /* Denormal result: shift the significand down and pack with exponent 0 */
    if (zExp < 0) {
        zSig >>= -zExp;
        zExp = 0;
    }
    /* Drop the 13 low bits to obtain the half-precision fraction */
    return packFloat16(zSign, zExp, zSig >> 13);
}
3490 
/*----------------------------------------------------------------------------
| Normalizes the non-zero subnormal half-precision significand `aSig'.  The
| shifted significand is stored at `zSigPtr' and the matching exponent at
| `zExpPtr'.  The shift amount is the leading-zero count of `aSig' minus 21.
*----------------------------------------------------------------------------*/

static void normalizeFloat16Subnormal(uint32_t aSig, int *zExpPtr,
                                      uint32_t *zSigPtr)
{
    int8_t lshift = countLeadingZeros32(aSig) - 21;
    uint32_t normSig = aSig << lshift;
    int normExp = 1 - lshift;

    *zSigPtr = normSig;
    *zExpPtr = normExp;
}
3498 
3499 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
3500    The latter gains extra exponent range by omitting the NaN/Inf encodings.  */
3501 
3502 float32 float16_to_float32(float16 a, flag ieee, float_status *status)
3503 {
3504     flag aSign;
3505     int aExp;
3506     uint32_t aSig;
3507 
3508     aSign = extractFloat16Sign(a);
3509     aExp = extractFloat16Exp(a);
3510     aSig = extractFloat16Frac(a);
3511 
3512     if (aExp == 0x1f && ieee) {
3513         if (aSig) {
3514             return commonNaNToFloat32(float16ToCommonNaN(a, status), status);
3515         }
3516         return packFloat32(aSign, 0xff, 0);
3517     }
3518     if (aExp == 0) {
3519         if (aSig == 0) {
3520             return packFloat32(aSign, 0, 0);
3521         }
3522 
3523         normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3524         aExp--;
3525     }
3526     return packFloat32( aSign, aExp + 0x70, aSig << 13);
3527 }
3528 
3529 float16 float32_to_float16(float32 a, flag ieee, float_status *status)
3530 {
3531     flag aSign;
3532     int aExp;
3533     uint32_t aSig;
3534 
3535     a = float32_squash_input_denormal(a, status);
3536 
3537     aSig = extractFloat32Frac( a );
3538     aExp = extractFloat32Exp( a );
3539     aSign = extractFloat32Sign( a );
3540     if ( aExp == 0xFF ) {
3541         if (aSig) {
3542             /* Input is a NaN */
3543             if (!ieee) {
3544                 float_raise(float_flag_invalid, status);
3545                 return packFloat16(aSign, 0, 0);
3546             }
3547             return commonNaNToFloat16(
3548                 float32ToCommonNaN(a, status), status);
3549         }
3550         /* Infinity */
3551         if (!ieee) {
3552             float_raise(float_flag_invalid, status);
3553             return packFloat16(aSign, 0x1f, 0x3ff);
3554         }
3555         return packFloat16(aSign, 0x1f, 0);
3556     }
3557     if (aExp == 0 && aSig == 0) {
3558         return packFloat16(aSign, 0, 0);
3559     }
3560     /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3561      * even if the input is denormal; however this is harmless because
3562      * the largest possible single-precision denormal is still smaller
3563      * than the smallest representable half-precision denormal, and so we
3564      * will end up ignoring aSig and returning via the "always return zero"
3565      * codepath.
3566      */
3567     aSig |= 0x00800000;
3568     aExp -= 0x71;
3569 
3570     return roundAndPackFloat16(aSign, aExp, aSig, ieee, status);
3571 }
3572 
3573 float64 float16_to_float64(float16 a, flag ieee, float_status *status)
3574 {
3575     flag aSign;
3576     int aExp;
3577     uint32_t aSig;
3578 
3579     aSign = extractFloat16Sign(a);
3580     aExp = extractFloat16Exp(a);
3581     aSig = extractFloat16Frac(a);
3582 
3583     if (aExp == 0x1f && ieee) {
3584         if (aSig) {
3585             return commonNaNToFloat64(
3586                 float16ToCommonNaN(a, status), status);
3587         }
3588         return packFloat64(aSign, 0x7ff, 0);
3589     }
3590     if (aExp == 0) {
3591         if (aSig == 0) {
3592             return packFloat64(aSign, 0, 0);
3593         }
3594 
3595         normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3596         aExp--;
3597     }
3598     return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42);
3599 }
3600 
3601 float16 float64_to_float16(float64 a, flag ieee, float_status *status)
3602 {
3603     flag aSign;
3604     int aExp;
3605     uint64_t aSig;
3606     uint32_t zSig;
3607 
3608     a = float64_squash_input_denormal(a, status);
3609 
3610     aSig = extractFloat64Frac(a);
3611     aExp = extractFloat64Exp(a);
3612     aSign = extractFloat64Sign(a);
3613     if (aExp == 0x7FF) {
3614         if (aSig) {
3615             /* Input is a NaN */
3616             if (!ieee) {
3617                 float_raise(float_flag_invalid, status);
3618                 return packFloat16(aSign, 0, 0);
3619             }
3620             return commonNaNToFloat16(
3621                 float64ToCommonNaN(a, status), status);
3622         }
3623         /* Infinity */
3624         if (!ieee) {
3625             float_raise(float_flag_invalid, status);
3626             return packFloat16(aSign, 0x1f, 0x3ff);
3627         }
3628         return packFloat16(aSign, 0x1f, 0);
3629     }
3630     shift64RightJamming(aSig, 29, &aSig);
3631     zSig = aSig;
3632     if (aExp == 0 && zSig == 0) {
3633         return packFloat16(aSign, 0, 0);
3634     }
3635     /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3636      * even if the input is denormal; however this is harmless because
3637      * the largest possible single-precision denormal is still smaller
3638      * than the smallest representable half-precision denormal, and so we
3639      * will end up ignoring aSig and returning via the "always return zero"
3640      * codepath.
3641      */
3642     zSig |= 0x00800000;
3643     aExp -= 0x3F1;
3644 
3645     return roundAndPackFloat16(aSign, aExp, zSig, ieee, status);
3646 }
3647 
3648 /*----------------------------------------------------------------------------
3649 | Returns the result of converting the double-precision floating-point value
3650 | `a' to the extended double-precision floating-point format.  The conversion
3651 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3652 | Arithmetic.
3653 *----------------------------------------------------------------------------*/
3654 
3655 floatx80 float64_to_floatx80(float64 a, float_status *status)
3656 {
3657     flag aSign;
3658     int aExp;
3659     uint64_t aSig;
3660 
3661     a = float64_squash_input_denormal(a, status);
3662     aSig = extractFloat64Frac( a );
3663     aExp = extractFloat64Exp( a );
3664     aSign = extractFloat64Sign( a );
3665     if ( aExp == 0x7FF ) {
3666         if (aSig) {
3667             return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
3668         }
3669         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3670     }
3671     if ( aExp == 0 ) {
3672         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3673         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3674     }
3675     return
3676         packFloatx80(
3677             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
3678 
3679 }
3680 
3681 /*----------------------------------------------------------------------------
3682 | Returns the result of converting the double-precision floating-point value
3683 | `a' to the quadruple-precision floating-point format.  The conversion is
3684 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3685 | Arithmetic.
3686 *----------------------------------------------------------------------------*/
3687 
3688 float128 float64_to_float128(float64 a, float_status *status)
3689 {
3690     flag aSign;
3691     int aExp;
3692     uint64_t aSig, zSig0, zSig1;
3693 
3694     a = float64_squash_input_denormal(a, status);
3695     aSig = extractFloat64Frac( a );
3696     aExp = extractFloat64Exp( a );
3697     aSign = extractFloat64Sign( a );
3698     if ( aExp == 0x7FF ) {
3699         if (aSig) {
3700             return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
3701         }
3702         return packFloat128( aSign, 0x7FFF, 0, 0 );
3703     }
3704     if ( aExp == 0 ) {
3705         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3706         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3707         --aExp;
3708     }
3709     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
3710     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
3711 
3712 }
3713 
3714 /*----------------------------------------------------------------------------
3715 | Rounds the double-precision floating-point value `a' to an integer, and
3716 | returns the result as a double-precision floating-point value.  The
3717 | operation is performed according to the IEC/IEEE Standard for Binary
3718 | Floating-Point Arithmetic.
3719 *----------------------------------------------------------------------------*/
3720 
float64 float64_round_to_int(float64 a, float_status *status)
{
    flag aSign;
    int aExp;
    uint64_t lastBitMask, roundBitsMask;
    uint64_t z;
    a = float64_squash_input_denormal(a, status);

    aExp = extractFloat64Exp( a );
    /* Exponent 0x433 or above: returned unchanged, except that a NaN goes
     * through NaN propagation.
     */
    if ( 0x433 <= aExp ) {
        if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
            return propagateFloat64NaN(a, a, status);
        }
        return a;
    }
    /* Exponent below 0x3FF: the result is 0 or 1 in magnitude, depending on
     * the rounding mode.
     */
    if ( aExp < 0x3FF ) {
        /* All bits other than the sign are clear: return the zero as is */
        if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a;
        status->float_exception_flags |= float_flag_inexact;
        aSign = extractFloat64Sign( a );
        switch (status->float_rounding_mode) {
         case float_round_nearest_even:
            /* Exponent 0x3FE with a non-zero fraction rounds to 1 */
            if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
                return packFloat64( aSign, 0x3FF, 0 );
            }
            break;
        case float_round_ties_away:
            /* Any value with exponent 0x3FE rounds to 1 */
            if (aExp == 0x3FE) {
                return packFloat64(aSign, 0x3ff, 0);
            }
            break;
         case float_round_down:
            /* Negative inputs give 0xBFF0000000000000, positive ones +0 */
            return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
         case float_round_up:
            /* Negative inputs give -0, positive ones 0x3FF0000000000000 */
            return make_float64(
            aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
        }
        /* Every other case gives a zero with the sign of the input */
        return packFloat64( aSign, 0, 0 );
    }
    /* General case: round directly on the raw bit pattern.  lastBitMask
     * selects the lowest bit that is kept, roundBitsMask all bits below it.
     */
    lastBitMask = 1;
    lastBitMask <<= 0x433 - aExp;
    roundBitsMask = lastBitMask - 1;
    z = float64_val(a);
    switch (status->float_rounding_mode) {
    case float_round_nearest_even:
        /* Add half of the lowest kept bit; if that leaves the dropped bits
         * all zero (an exact tie), clear the lowest kept bit.
         */
        z += lastBitMask >> 1;
        if ((z & roundBitsMask) == 0) {
            z &= ~lastBitMask;
        }
        break;
    case float_round_ties_away:
        z += lastBitMask >> 1;
        break;
    case float_round_to_zero:
        break;
    case float_round_up:
        /* Only positive values are bumped away from zero */
        if (!extractFloat64Sign(make_float64(z))) {
            z += roundBitsMask;
        }
        break;
    case float_round_down:
        /* Only negative values are bumped away from zero */
        if (extractFloat64Sign(make_float64(z))) {
            z += roundBitsMask;
        }
        break;
    default:
        abort();
    }
    /* Drop the bits below the lowest kept bit */
    z &= ~ roundBitsMask;
    /* Any change to the bit pattern means the result is inexact */
    if (z != float64_val(a)) {
        status->float_exception_flags |= float_flag_inexact;
    }
    return make_float64(z);

}
3795 
3796 float64 float64_trunc_to_int(float64 a, float_status *status)
3797 {
3798     int oldmode;
3799     float64 res;
3800     oldmode = status->float_rounding_mode;
3801     status->float_rounding_mode = float_round_to_zero;
3802     res = float64_round_to_int(a, status);
3803     status->float_rounding_mode = oldmode;
3804     return res;
3805 }
3806 
3807 /*----------------------------------------------------------------------------
3808 | Returns the result of adding the absolute values of the double-precision
3809 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
3810 | before being returned.  `zSign' is ignored if the result is a NaN.
3811 | The addition is performed according to the IEC/IEEE Standard for Binary
3812 | Floating-Point Arithmetic.
3813 *----------------------------------------------------------------------------*/
3814 
static float64 addFloat64Sigs(float64 a, float64 b, flag zSign,
                              float_status *status)
{
    int aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig;
    int expDiff;

    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    bSig = extractFloat64Frac( b );
    bExp = extractFloat64Exp( b );
    expDiff = aExp - bExp;
    /* Move both fractions up by 9 bits; the implicit leading 1 then sits
     * at 0x2000000000000000.
     */
    aSig <<= 9;
    bSig <<= 9;
    if ( 0 < expDiff ) {
        /* a has the larger exponent */
        if ( aExp == 0x7FF ) {
            /* a is a NaN (non-zero fraction) or an infinity */
            if (aSig) {
                return propagateFloat64NaN(a, b, status);
            }
            return a;
        }
        if ( bExp == 0 ) {
            /* b has a zero exponent field: no implicit bit is added and
             * the alignment shift is one less.
             */
            --expDiff;
        }
        else {
            bSig |= LIT64( 0x2000000000000000 );
        }
        /* Align b with a */
        shift64RightJamming( bSig, expDiff, &bSig );
        zExp = aExp;
    }
    else if ( expDiff < 0 ) {
        /* b has the larger exponent: mirror image of the case above */
        if ( bExp == 0x7FF ) {
            if (bSig) {
                return propagateFloat64NaN(a, b, status);
            }
            /* b is an infinity: the result is an infinity signed by zSign */
            return packFloat64( zSign, 0x7FF, 0 );
        }
        if ( aExp == 0 ) {
            ++expDiff;
        }
        else {
            aSig |= LIT64( 0x2000000000000000 );
        }
        shift64RightJamming( aSig, - expDiff, &aSig );
        zExp = bExp;
    }
    else {
        /* Equal exponents: no alignment needed */
        if ( aExp == 0x7FF ) {
            if (aSig | bSig) {
                return propagateFloat64NaN(a, b, status);
            }
            return a;
        }
        if ( aExp == 0 ) {
            /* Both exponent fields are zero */
            if (status->flush_to_zero) {
                /* Flush the result to a signed zero, flagging
                 * output_denormal if either fraction was non-zero.
                 */
                if (aSig | bSig) {
                    float_raise(float_flag_output_denormal, status);
                }
                return packFloat64(zSign, 0, 0);
            }
            /* Pack the fraction sum directly with a zero exponent field */
            return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
        }
        /* 0x4000000000000000 is the sum of the two implicit leading bits */
        zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
        zExp = aExp;
        goto roundAndPack;
    }
    /* Only the smaller operand has had its implicit bit added so far.  Bit
     * 61 of aSig is clear in both branches above, so OR-ing it in here adds
     * the missing implicit bit to the sum.
     */
    aSig |= LIT64( 0x2000000000000000 );
    /* Try the sum shifted one place left with a decremented exponent; if
     * that sets the top bit, fall back to the unshifted sum.
     */
    zSig = ( aSig + bSig )<<1;
    --zExp;
    if ( (int64_t) zSig < 0 ) {
        zSig = aSig + bSig;
        ++zExp;
    }
 roundAndPack:
    return roundAndPackFloat64(zSign, zExp, zSig, status);

}
3892 
3893 /*----------------------------------------------------------------------------
3894 | Returns the result of subtracting the absolute values of the double-
3895 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
3896 | difference is negated before being returned.  `zSign' is ignored if the
3897 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
3898 | Standard for Binary Floating-Point Arithmetic.
3899 *----------------------------------------------------------------------------*/
3900 
static float64 subFloat64Sigs(float64 a, float64 b, flag zSign,
                              float_status *status)
{
    int aExp, bExp, zExp;
    uint64_t aSig, bSig, zSig;
    int expDiff;

    aSig = extractFloat64Frac( a );
    aExp = extractFloat64Exp( a );
    bSig = extractFloat64Frac( b );
    bExp = extractFloat64Exp( b );
    expDiff = aExp - bExp;
    /* Move both fractions up by 10 bits; the implicit leading 1 then sits
     * at 0x4000000000000000.
     */
    aSig <<= 10;
    bSig <<= 10;
    if ( 0 < expDiff ) goto aExpBigger;
    if ( expDiff < 0 ) goto bExpBigger;
    /* Equal exponents from here on */
    if ( aExp == 0x7FF ) {
        /* Either operand is a NaN, or both are infinities; the latter is
         * invalid and yields the default NaN.
         */
        if (aSig | bSig) {
            return propagateFloat64NaN(a, b, status);
        }
        float_raise(float_flag_invalid, status);
        return float64_default_nan(status);
    }
    if ( aExp == 0 ) {
        /* Both exponent fields are zero: use exponent 1 for the result
         * computation (no implicit bits are added on this path).
         */
        aExp = 1;
        bExp = 1;
    }
    if ( bSig < aSig ) goto aBigger;
    if ( aSig < bSig ) goto bBigger;
    /* Exact cancellation: the zero is negative only when rounding down */
    return packFloat64(status->float_rounding_mode == float_round_down, 0, 0);
 bExpBigger:
    if ( bExp == 0x7FF ) {
        if (bSig) {
            return propagateFloat64NaN(a, b, status);
        }
        /* b is an infinity: the result is an infinity of the opposite sign */
        return packFloat64( zSign ^ 1, 0x7FF, 0 );
    }
    if ( aExp == 0 ) {
        /* a has a zero exponent field: no implicit bit is added and the
         * alignment shift is one less.
         */
        ++expDiff;
    }
    else {
        aSig |= LIT64( 0x4000000000000000 );
    }
    /* Align a with b, then add b's implicit bit */
    shift64RightJamming( aSig, - expDiff, &aSig );
    bSig |= LIT64( 0x4000000000000000 );
 bBigger:
    /* b is larger in magnitude: compute b - a and flip the result sign */
    zSig = bSig - aSig;
    zExp = bExp;
    zSign ^= 1;
    goto normalizeRoundAndPack;
 aExpBigger:
    if ( aExp == 0x7FF ) {
        /* a is a NaN (non-zero fraction) or an infinity */
        if (aSig) {
            return propagateFloat64NaN(a, b, status);
        }
        return a;
    }
    if ( bExp == 0 ) {
        /* b has a zero exponent field: no implicit bit is added and the
         * alignment shift is one less.
         */
        --expDiff;
    }
    else {
        bSig |= LIT64( 0x4000000000000000 );
    }
    /* Align b with a, then add a's implicit bit */
    shift64RightJamming( bSig, expDiff, &bSig );
    aSig |= LIT64( 0x4000000000000000 );
 aBigger:
    /* a is larger in magnitude: compute a - b and keep the result sign */
    zSig = aSig - bSig;
    zExp = aExp;
 normalizeRoundAndPack:
    --zExp;
    return normalizeRoundAndPackFloat64(zSign, zExp, zSig, status);

}
3974 
3975 /*----------------------------------------------------------------------------
3976 | Returns the result of adding the double-precision floating-point values `a'
3977 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
3978 | Binary Floating-Point Arithmetic.
3979 *----------------------------------------------------------------------------*/
3980 
3981 float64 float64_add(float64 a, float64 b, float_status *status)
3982 {
3983     flag aSign, bSign;
3984     a = float64_squash_input_denormal(a, status);
3985     b = float64_squash_input_denormal(b, status);
3986 
3987     aSign = extractFloat64Sign( a );
3988     bSign = extractFloat64Sign( b );
3989     if ( aSign == bSign ) {
3990         return addFloat64Sigs(a, b, aSign, status);
3991     }
3992     else {
3993         return subFloat64Sigs(a, b, aSign, status);
3994     }
3995 
3996 }
3997 
3998 /*----------------------------------------------------------------------------
3999 | Returns the result of subtracting the double-precision floating-point values
4000 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
4001 | for Binary Floating-Point Arithmetic.
4002 *----------------------------------------------------------------------------*/
4003 
4004 float64 float64_sub(float64 a, float64 b, float_status *status)
4005 {
4006     flag aSign, bSign;
4007     a = float64_squash_input_denormal(a, status);
4008     b = float64_squash_input_denormal(b, status);
4009 
4010     aSign = extractFloat64Sign( a );
4011     bSign = extractFloat64Sign( b );
4012     if ( aSign == bSign ) {
4013         return subFloat64Sigs(a, b, aSign, status);
4014     }
4015     else {
4016         return addFloat64Sigs(a, b, aSign, status);
4017     }
4018 
4019 }
4020 
4021 /*----------------------------------------------------------------------------
4022 | Returns the result of multiplying the double-precision floating-point values
4023 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
4024 | for Binary Floating-Point Arithmetic.
4025 *----------------------------------------------------------------------------*/
4026 
4027 float64 float64_mul(float64 a, float64 b, float_status *status)
4028 {
4029     flag aSign, bSign, zSign;
4030     int aExp, bExp, zExp;
4031     uint64_t aSig, bSig, zSig0, zSig1;
4032 
4033     a = float64_squash_input_denormal(a, status);
4034     b = float64_squash_input_denormal(b, status);
4035 
4036     aSig = extractFloat64Frac( a );
4037     aExp = extractFloat64Exp( a );
4038     aSign = extractFloat64Sign( a );
4039     bSig = extractFloat64Frac( b );
4040     bExp = extractFloat64Exp( b );
4041     bSign = extractFloat64Sign( b );
4042     zSign = aSign ^ bSign;
4043     if ( aExp == 0x7FF ) {
4044         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
4045             return propagateFloat64NaN(a, b, status);
4046         }
4047         if ( ( bExp | bSig ) == 0 ) {
4048             float_raise(float_flag_invalid, status);
4049             return float64_default_nan(status);
4050         }
4051         return packFloat64( zSign, 0x7FF, 0 );
4052     }
4053     if ( bExp == 0x7FF ) {
4054         if (bSig) {
4055             return propagateFloat64NaN(a, b, status);
4056         }
4057         if ( ( aExp | aSig ) == 0 ) {
4058             float_raise(float_flag_invalid, status);
4059             return float64_default_nan(status);
4060         }
4061         return packFloat64( zSign, 0x7FF, 0 );
4062     }
4063     if ( aExp == 0 ) {
4064         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
4065         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4066     }
4067     if ( bExp == 0 ) {
4068         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
4069         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4070     }
4071     zExp = aExp + bExp - 0x3FF;
4072     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
4073     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4074     mul64To128( aSig, bSig, &zSig0, &zSig1 );
4075     zSig0 |= ( zSig1 != 0 );
4076     if ( 0 <= (int64_t) ( zSig0<<1 ) ) {
4077         zSig0 <<= 1;
4078         --zExp;
4079     }
4080     return roundAndPackFloat64(zSign, zExp, zSig0, status);
4081 
4082 }
4083 
4084 /*----------------------------------------------------------------------------
4085 | Returns the result of dividing the double-precision floating-point value `a'
4086 | by the corresponding value `b'.  The operation is performed according to
4087 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4088 *----------------------------------------------------------------------------*/
4089 
4090 float64 float64_div(float64 a, float64 b, float_status *status)
4091 {
4092     flag aSign, bSign, zSign;
4093     int aExp, bExp, zExp;
4094     uint64_t aSig, bSig, zSig;
4095     uint64_t rem0, rem1;
4096     uint64_t term0, term1;
4097     a = float64_squash_input_denormal(a, status);
4098     b = float64_squash_input_denormal(b, status);
4099 
4100     aSig = extractFloat64Frac( a );
4101     aExp = extractFloat64Exp( a );
4102     aSign = extractFloat64Sign( a );
4103     bSig = extractFloat64Frac( b );
4104     bExp = extractFloat64Exp( b );
4105     bSign = extractFloat64Sign( b );
4106     zSign = aSign ^ bSign;
4107     if ( aExp == 0x7FF ) {
4108         if (aSig) {
4109             return propagateFloat64NaN(a, b, status);
4110         }
4111         if ( bExp == 0x7FF ) {
4112             if (bSig) {
4113                 return propagateFloat64NaN(a, b, status);
4114             }
4115             float_raise(float_flag_invalid, status);
4116             return float64_default_nan(status);
4117         }
4118         return packFloat64( zSign, 0x7FF, 0 );
4119     }
4120     if ( bExp == 0x7FF ) {
4121         if (bSig) {
4122             return propagateFloat64NaN(a, b, status);
4123         }
4124         return packFloat64( zSign, 0, 0 );
4125     }
4126     if ( bExp == 0 ) {
4127         if ( bSig == 0 ) {
4128             if ( ( aExp | aSig ) == 0 ) {
4129                 float_raise(float_flag_invalid, status);
4130                 return float64_default_nan(status);
4131             }
4132             float_raise(float_flag_divbyzero, status);
4133             return packFloat64( zSign, 0x7FF, 0 );
4134         }
4135         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4136     }
4137     if ( aExp == 0 ) {
4138         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
4139         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4140     }
4141     zExp = aExp - bExp + 0x3FD;
4142     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
4143     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4144     if ( bSig <= ( aSig + aSig ) ) {
4145         aSig >>= 1;
4146         ++zExp;
4147     }
4148     zSig = estimateDiv128To64( aSig, 0, bSig );
4149     if ( ( zSig & 0x1FF ) <= 2 ) {
4150         mul64To128( bSig, zSig, &term0, &term1 );
4151         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
4152         while ( (int64_t) rem0 < 0 ) {
4153             --zSig;
4154             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4155         }
4156         zSig |= ( rem1 != 0 );
4157     }
4158     return roundAndPackFloat64(zSign, zExp, zSig, status);
4159 
4160 }
4161 
4162 /*----------------------------------------------------------------------------
4163 | Returns the remainder of the double-precision floating-point value `a'
4164 | with respect to the corresponding value `b'.  The operation is performed
4165 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4166 *----------------------------------------------------------------------------*/
4167 
4168 float64 float64_rem(float64 a, float64 b, float_status *status)
4169 {
4170     flag aSign, zSign;
4171     int aExp, bExp, expDiff;
4172     uint64_t aSig, bSig;
4173     uint64_t q, alternateASig;
4174     int64_t sigMean;
4175
         /* Pre-process both operands with float64_squash_input_denormal(),
          * then unpack fraction and exponent of each (and the sign of a).
          */
4176     a = float64_squash_input_denormal(a, status);
4177     b = float64_squash_input_denormal(b, status);
4178     aSig = extractFloat64Frac( a );
4179     aExp = extractFloat64Exp( a );
4180     aSign = extractFloat64Sign( a );
4181     bSig = extractFloat64Frac( b );
4182     bExp = extractFloat64Exp( b );
         /* a has the maximum exponent.  If either operand is a NaN the result
          * comes from propagateFloat64NaN(); otherwise a is an infinity, which
          * raises invalid and yields the default NaN.
          */
4183     if ( aExp == 0x7FF ) {
4184         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
4185             return propagateFloat64NaN(a, b, status);
4186         }
4187         float_raise(float_flag_invalid, status);
4188         return float64_default_nan(status);
4189     }
         /* b has the maximum exponent: a NaN b is propagated; for an infinite
          * b the (squashed) operand a is returned as is.
          */
4190     if ( bExp == 0x7FF ) {
4191         if (bSig) {
4192             return propagateFloat64NaN(a, b, status);
4193         }
4194         return a;
4195     }
         /* b is zero (invalid, default NaN) or subnormal (normalised). */
4196     if ( bExp == 0 ) {
4197         if ( bSig == 0 ) {
4198             float_raise(float_flag_invalid, status);
4199             return float64_default_nan(status);
4200         }
4201         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4202     }
         /* a is zero (returned as is) or subnormal (normalised). */
4203     if ( aExp == 0 ) {
4204         if ( aSig == 0 ) return a;
4205         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4206     }
4207     expDiff = aExp - bExp;
         /* Set the implicit leading bit (bit 52) and shift left by 11, which
          * moves that bit to bit 63 of each significand.
          */
4208     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
4209     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
         /* a's exponent is below b's: if it is lower by more than one, a is
          * returned as is; if it is lower by exactly one, aSig is halved.
          */
4210     if ( expDiff < 0 ) {
4211         if ( expDiff < -1 ) return a;
4212         aSig >>= 1;
4213     }
         /* q starts as 1 when bSig fits into aSig (and is then subtracted),
          * otherwise as 0.
          */
4214     q = ( bSig <= aSig );
4215     if ( q ) aSig -= bSig;
         /* While more than 64 exponent steps remain, each pass takes a
          * quotient estimate from estimateDiv128To64(), lowers it by 2
          * (clamped at 0), forms the new aSig from it and consumes 62 steps.
          * NOTE(review): the "- 2" presumably keeps the estimate from ever
          * exceeding the true quotient -- confirm against
          * estimateDiv128To64().
          */
4216     expDiff -= 64;
4217     while ( 0 < expDiff ) {
4218         q = estimateDiv128To64( aSig, 0, bSig );
4219         q = ( 2 < q ) ? q - 2 : 0;
4220         aSig = - ( ( bSig>>2 ) * q );
4221         expDiff -= 62;
4222     }
4223     expDiff += 64;
         /* Final partial step: the adjusted estimate is cut down to the
          * remaining expDiff bits.  In both branches bSig ends up shifted
          * right by two.
          */
4224     if ( 0 < expDiff ) {
4225         q = estimateDiv128To64( aSig, 0, bSig );
4226         q = ( 2 < q ) ? q - 2 : 0;
4227         q >>= 64 - expDiff;
4228         bSig >>= 2;
4229         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4230     }
4231     else {
4232         aSig >>= 2;
4233         bSig >>= 2;
4234     }
         /* Keep subtracting bSig, counting in q, until aSig becomes negative
          * when viewed as a signed value.  alternateASig holds the value from
          * just before the last subtraction.
          */
4235     do {
4236         alternateASig = aSig;
4237         ++q;
4238         aSig -= bSig;
4239     } while ( 0 <= (int64_t) aSig );
         /* Choose between the two candidates: alternateASig is taken when
          * the sum of both is negative, or when the sum is zero and q is odd;
          * otherwise the negative candidate in aSig is kept.
          */
4240     sigMean = aSig + alternateASig;
4241     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4242         aSig = alternateASig;
4243     }
         /* A negative candidate is replaced by its magnitude and flips the
          * result sign relative to the sign of a.  The result is packed with
          * b's exponent.
          */
4244     zSign = ( (int64_t) aSig < 0 );
4245     if ( zSign ) aSig = - aSig;
4246     return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
4247
4248 }
4249 
4250 /*----------------------------------------------------------------------------
4251 | Returns the result of multiplying the double-precision floating-point values
4252 | `a' and `b' then adding 'c', with no intermediate rounding step after the
4253 | multiplication.  The operation is performed according to the IEC/IEEE
4254 | Standard for Binary Floating-Point Arithmetic 754-2008.
4255 | The flags argument allows the caller to select negation of the
4256 | addend, the intermediate product, or the final result. (The difference
4257 | between this and having the caller do a separate negation is that negating
4258 | externally will flip the sign bit on NaNs.)
4259 *----------------------------------------------------------------------------*/
4260 
4261 float64 float64_muladd(float64 a, float64 b, float64 c, int flags,
4262                        float_status *status)
4263 {
4264     flag aSign, bSign, cSign, zSign;
4265     int aExp, bExp, cExp, pExp, zExp, expDiff;
4266     uint64_t aSig, bSig, cSig;
4267     flag pInf, pZero, pSign;
4268     uint64_t pSig0, pSig1, cSig0, cSig1, zSig0, zSig1;
4269     int shiftcount;
4270     flag signflip, infzero;
4271
         /* Pre-process all three operands with
          * float64_squash_input_denormal(), then unpack fraction, exponent
          * and sign of each.
          */
4272     a = float64_squash_input_denormal(a, status);
4273     b = float64_squash_input_denormal(b, status);
4274     c = float64_squash_input_denormal(c, status);
4275     aSig = extractFloat64Frac(a);
4276     aExp = extractFloat64Exp(a);
4277     aSign = extractFloat64Sign(a);
4278     bSig = extractFloat64Frac(b);
4279     bExp = extractFloat64Exp(b);
4280     bSign = extractFloat64Sign(b);
4281     cSig = extractFloat64Frac(c);
4282     cExp = extractFloat64Exp(c);
4283     cSign = extractFloat64Sign(c);
4284
         /* infzero: one of a, b is a zero and the other is an infinity. */
4285     infzero = ((aExp == 0 && aSig == 0 && bExp == 0x7ff && bSig == 0) ||
4286                (aExp == 0x7ff && aSig == 0 && bExp == 0 && bSig == 0));
4287
4288     /* It is implementation-defined whether the cases of (0,inf,qnan)
4289      * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
4290      * they return if they do), so we have to hand this information
4291      * off to the target-specific pick-a-NaN routine.
4292      */
4293     if (((aExp == 0x7ff) && aSig) ||
4294         ((bExp == 0x7ff) && bSig) ||
4295         ((cExp == 0x7ff) && cSig)) {
4296         return propagateFloat64MulAddNaN(a, b, c, infzero, status);
4297     }
4298
         /* No NaN operand: zero times infinity is an invalid operation. */
4299     if (infzero) {
4300         float_raise(float_flag_invalid, status);
4301         return float64_default_nan(status);
4302     }
4303
         /* Negating c is done by flipping its unpacked sign. */
4304     if (flags & float_muladd_negate_c) {
4305         cSign ^= 1;
4306     }
4307
         /* signflip is XORed into the sign of every non-NaN result below. */
4308     signflip = (flags & float_muladd_negate_result) ? 1 : 0;
4309
4310     /* Work out the sign and type of the product */
4311     pSign = aSign ^ bSign;
4312     if (flags & float_muladd_negate_product) {
4313         pSign ^= 1;
4314     }
         /* NaNs were handled above, so a maximum exponent means infinity. */
4315     pInf = (aExp == 0x7ff) || (bExp == 0x7ff);
4316     pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
4317
         /* c is an infinity. */
4318     if (cExp == 0x7ff) {
4319         if (pInf && (pSign ^ cSign)) {
4320             /* addition of opposite-signed infinities => InvalidOperation */
4321             float_raise(float_flag_invalid, status);
4322             return float64_default_nan(status);
4323         }
4324         /* Otherwise generate an infinity of the same sign */
4325         return packFloat64(cSign ^ signflip, 0x7ff, 0);
4326     }
4327
         /* Infinite product with a finite c: infinity with the product sign. */
4328     if (pInf) {
4329         return packFloat64(pSign ^ signflip, 0x7ff, 0);
4330     }
4331
         /* The product is an exact zero; every path in this block returns. */
4332     if (pZero) {
4333         if (cExp == 0) {
4334             if (cSig == 0) {
4335                 /* Adding two exact zeroes */
4336                 if (pSign == cSign) {
4337                     zSign = pSign;
4338                 } else if (status->float_rounding_mode == float_round_down) {
4339                     zSign = 1;
4340                 } else {
4341                     zSign = 0;
4342                 }
4343                 return packFloat64(zSign ^ signflip, 0, 0);
4344             }
4345             /* Exact zero plus a denorm */
4346             if (status->flush_to_zero) {
4347                 float_raise(float_flag_output_denormal, status);
4348                 return packFloat64(cSign ^ signflip, 0, 0);
4349             }
4350         }
4351         /* Zero plus something non-zero : just return the something */
4352         if (flags & float_muladd_halve_result) {
4353             if (cExp == 0) {
4354                 normalizeFloat64Subnormal(cSig, &cExp, &cSig);
4355             }
4356             /* Subtract one to halve, and one again because roundAndPackFloat64
4357              * wants one less than the true exponent.
4358              */
4359             cExp -= 2;
4360             cSig = (cSig | 0x0010000000000000ULL) << 10;
4361             return roundAndPackFloat64(cSign ^ signflip, cExp, cSig, status);
4362         }
4363         return packFloat64(cSign ^ signflip, cExp, cSig);
4364     }
4365
         /* Neither a nor b is zero here (pZero returned above), so a zero
          * exponent means a subnormal that has to be normalised.
          */
4366     if (aExp == 0) {
4367         normalizeFloat64Subnormal(aSig, &aExp, &aSig);
4368     }
4369     if (bExp == 0) {
4370         normalizeFloat64Subnormal(bSig, &bExp, &bSig);
4371     }
4372
4373     /* Calculate the actual result a * b + c */
4374
4375     /* Multiply first; this is easy. */
4376     /* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff
4377      * because we want the true exponent, not the "one-less-than"
4378      * flavour that roundAndPackFloat64() takes.
4379      */
4380     pExp = aExp + bExp - 0x3fe;
4381     aSig = (aSig | LIT64(0x0010000000000000))<<10;
4382     bSig = (bSig | LIT64(0x0010000000000000))<<11;
4383     mul64To128(aSig, bSig, &pSig0, &pSig1);
         /* If bit 62 of the high word is clear, shift the 128-bit product
          * left by one and lower the exponent to compensate.
          */
4384     if ((int64_t)(pSig0 << 1) >= 0) {
4385         shortShift128Left(pSig0, pSig1, 1, &pSig0, &pSig1);
4386         pExp--;
4387     }
4388
         /* Start from the product sign; the subtraction paths below flip it
          * when the addend turns out to be the larger magnitude.
          */
4389     zSign = pSign ^ signflip;
4390
4391     /* Now [pSig0:pSig1] is the significand of the multiply, with the explicit
4392      * bit in position 126.
4393      */
4394     if (cExp == 0) {
4395         if (!cSig) {
4396             /* Throw out the special case of c being an exact zero now */
                 /* The 128-bit product is reduced to 64 bits in pSig1 by a
                  * jamming right shift of 64; halving lowers the exponent by
                  * one more.
                  */
4397             shift128RightJamming(pSig0, pSig1, 64, &pSig0, &pSig1);
4398             if (flags & float_muladd_halve_result) {
4399                 pExp--;
4400             }
4401             return roundAndPackFloat64(zSign, pExp - 1,
4402                                        pSig1, status);
4403         }
4404         normalizeFloat64Subnormal(cSig, &cExp, &cSig);
4405     }
4406
4407     /* Shift cSig and add the explicit bit so [cSig0:cSig1] is the
4408      * significand of the addend, with the explicit bit in position 126.
4409      */
4410     cSig0 = cSig << (126 - 64 - 52);
4411     cSig1 = 0;
4412     cSig0 |= LIT64(0x4000000000000000);
         /* Positive expDiff: the product has the larger exponent. */
4413     expDiff = pExp - cExp;
4414
4415     if (pSign == cSign) {
4416         /* Addition */
4417         if (expDiff > 0) {
4418             /* scale c to match p */
4419             shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
4420             zExp = pExp;
4421         } else if (expDiff < 0) {
4422             /* scale p to match c */
4423             shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
4424             zExp = cExp;
4425         } else {
4426             /* no scaling needed */
4427             zExp = cExp;
4428         }
4429         /* Add significands and make sure explicit bit ends up in posn 126 */
4430         add128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
             /* A carry into bit 127 is shifted back down; otherwise the
              * exponent is lowered by one instead.
              */
4431         if ((int64_t)zSig0 < 0) {
4432             shift128RightJamming(zSig0, zSig1, 1, &zSig0, &zSig1);
4433         } else {
4434             zExp--;
4435         }
             /* Reduce to a 64-bit significand in zSig1 (jamming shift). */
4436         shift128RightJamming(zSig0, zSig1, 64, &zSig0, &zSig1);
4437         if (flags & float_muladd_halve_result) {
4438             zExp--;
4439         }
4440         return roundAndPackFloat64(zSign, zExp, zSig1, status);
4441     } else {
4442         /* Subtraction */
             /* The operand with the smaller exponent is aligned with a
              * jamming right shift and subtracted from the other.  When c is
              * the minuend the result sign is flipped.
              */
4443         if (expDiff > 0) {
4444             shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
4445             sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4446             zExp = pExp;
4447         } else if (expDiff < 0) {
4448             shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
4449             sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
4450             zExp = cExp;
4451             zSign ^= 1;
4452         } else {
                 /* Equal exponents: compare the significands to decide which
                  * one is the minuend.
                  */
4453             zExp = pExp;
4454             if (lt128(cSig0, cSig1, pSig0, pSig1)) {
4455                 sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4456             } else if (lt128(pSig0, pSig1, cSig0, cSig1)) {
4457                 sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
4458                 zSign ^= 1;
4459             } else {
4460                 /* Exact zero */
                     /* The zero is negative only in round-down mode (before
                      * signflip is applied).
                      */
4461                 zSign = signflip;
4462                 if (status->float_rounding_mode == float_round_down) {
4463                     zSign ^= 1;
4464                 }
4465                 return packFloat64(zSign, 0, 0);
4466             }
4467         }
4468         --zExp;
4469         /* Do the equivalent of normalizeRoundAndPackFloat64() but
4470          * starting with the significand in a pair of uint64_t.
4471          */
4472         if (zSig0) {
                 /* Shift left until the leading one sits at bit 62 of zSig0;
                  * any bits left in zSig1 are recorded in the lowest bit of
                  * zSig0.
                  */
4473             shiftcount = countLeadingZeros64(zSig0) - 1;
4474             shortShift128Left(zSig0, zSig1, shiftcount, &zSig0, &zSig1);
4475             if (zSig1) {
4476                 zSig0 |= 1;
4477             }
4478             zExp -= shiftcount;
4479         } else {
                 /* The high word is zero: all significant bits are in zSig1.
                  * With the leading one at bit 63, shift right by one and
                  * keep the dropped bit in the lowest bit; otherwise shift
                  * left so that the leading one lands at bit 62.
                  */
4480             shiftcount = countLeadingZeros64(zSig1);
4481             if (shiftcount == 0) {
4482                 zSig0 = (zSig1 >> 1) | (zSig1 & 1);
4483                 zExp -= 63;
4484             } else {
4485                 shiftcount--;
4486                 zSig0 = zSig1 << shiftcount;
4487                 zExp -= (shiftcount + 64);
4488             }
4489         }
4490         if (flags & float_muladd_halve_result) {
4491             zExp--;
4492         }
4493         return roundAndPackFloat64(zSign, zExp, zSig0, status);
4494     }
4495 }
4496 
4497 /*----------------------------------------------------------------------------
4498 | Returns the square root of the double-precision floating-point value `a'.
4499 | The operation is performed according to the IEC/IEEE Standard for Binary
4500 | Floating-Point Arithmetic.
4501 *----------------------------------------------------------------------------*/
4502 
4503 float64 float64_sqrt(float64 a, float_status *status)
4504 {
4505     flag aSign;
4506     int aExp, zExp;
4507     uint64_t aSig, zSig, doubleZSig;
4508     uint64_t rem0, rem1, term0, term1;
4509     a = float64_squash_input_denormal(a, status);
4510 
4511     aSig = extractFloat64Frac( a );
4512     aExp = extractFloat64Exp( a );
4513     aSign = extractFloat64Sign( a );
4514     if ( aExp == 0x7FF ) {
4515         if (aSig) {
4516             return propagateFloat64NaN(a, a, status);
4517         }
4518         if ( ! aSign ) return a;
4519         float_raise(float_flag_invalid, status);
4520         return float64_default_nan(status);
4521     }
4522     if ( aSign ) {
4523         if ( ( aExp | aSig ) == 0 ) return a;
4524         float_raise(float_flag_invalid, status);
4525         return float64_default_nan(status);
4526     }
4527     if ( aExp == 0 ) {
4528         if ( aSig == 0 ) return float64_zero;
4529         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4530     }
4531     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
4532     aSig |= LIT64( 0x0010000000000000 );
4533     zSig = estimateSqrt32( aExp, aSig>>21 );
4534     aSig <<= 9 - ( aExp & 1 );
4535     zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
4536     if ( ( zSig & 0x1FF ) <= 5 ) {
4537         doubleZSig = zSig<<1;
4538         mul64To128( zSig, zSig, &term0, &term1 );
4539         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
4540         while ( (int64_t) rem0 < 0 ) {
4541             --zSig;
4542             doubleZSig -= 2;
4543             add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
4544         }
4545         zSig |= ( ( rem0 | rem1 ) != 0 );
4546     }
4547     return roundAndPackFloat64(0, zExp, zSig, status);
4548 
4549 }
4550 
4551 /*----------------------------------------------------------------------------
4552 | Returns the binary log of the double-precision floating-point value `a'.
4553 | The operation is performed according to the IEC/IEEE Standard for Binary
4554 | Floating-Point Arithmetic.
4555 *----------------------------------------------------------------------------*/
4556 float64 float64_log2(float64 a, float_status *status)
4557 {
4558     flag aSign, zSign;
4559     int aExp;
4560     uint64_t aSig, aSig0, aSig1, zSig, i;
4561     a = float64_squash_input_denormal(a, status);
4562 
4563     aSig = extractFloat64Frac( a );
4564     aExp = extractFloat64Exp( a );
4565     aSign = extractFloat64Sign( a );
4566 
4567     if ( aExp == 0 ) {
4568         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4569         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4570     }
4571     if ( aSign ) {
4572         float_raise(float_flag_invalid, status);
4573         return float64_default_nan(status);
4574     }
4575     if ( aExp == 0x7FF ) {
4576         if (aSig) {
4577             return propagateFloat64NaN(a, float64_zero, status);
4578         }
4579         return a;
4580     }
4581 
4582     aExp -= 0x3FF;
4583     aSig |= LIT64( 0x0010000000000000 );
4584     zSign = aExp < 0;
4585     zSig = (uint64_t)aExp << 52;
4586     for (i = 1LL << 51; i > 0; i >>= 1) {
4587         mul64To128( aSig, aSig, &aSig0, &aSig1 );
4588         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4589         if ( aSig & LIT64( 0x0020000000000000 ) ) {
4590             aSig >>= 1;
4591             zSig |= i;
4592         }
4593     }
4594 
4595     if ( zSign )
4596         zSig = -zSig;
4597     return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
4598 }
4599 
4600 /*----------------------------------------------------------------------------
4601 | Returns 1 if the double-precision floating-point value `a' is equal to the
4602 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
4603 | if either operand is a NaN.  Otherwise, the comparison is performed
4604 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4605 *----------------------------------------------------------------------------*/
4606 
4607 int float64_eq(float64 a, float64 b, float_status *status)
4608 {
4609     uint64_t av, bv;
4610     a = float64_squash_input_denormal(a, status);
4611     b = float64_squash_input_denormal(b, status);
4612 
4613     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4614          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4615        ) {
4616         float_raise(float_flag_invalid, status);
4617         return 0;
4618     }
4619     av = float64_val(a);
4620     bv = float64_val(b);
4621     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4622 
4623 }
4624 
4625 /*----------------------------------------------------------------------------
4626 | Returns 1 if the double-precision floating-point value `a' is less than or
4627 | equal to the corresponding value `b', and 0 otherwise.  The invalid
4628 | exception is raised if either operand is a NaN.  The comparison is performed
4629 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4630 *----------------------------------------------------------------------------*/
4631 
4632 int float64_le(float64 a, float64 b, float_status *status)
4633 {
4634     flag aSign, bSign;
4635     uint64_t av, bv;
4636     a = float64_squash_input_denormal(a, status);
4637     b = float64_squash_input_denormal(b, status);
4638 
4639     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4640          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4641        ) {
4642         float_raise(float_flag_invalid, status);
4643         return 0;
4644     }
4645     aSign = extractFloat64Sign( a );
4646     bSign = extractFloat64Sign( b );
4647     av = float64_val(a);
4648     bv = float64_val(b);
4649     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4650     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4651 
4652 }
4653 
4654 /*----------------------------------------------------------------------------
4655 | Returns 1 if the double-precision floating-point value `a' is less than
4656 | the corresponding value `b', and 0 otherwise.  The invalid exception is
4657 | raised if either operand is a NaN.  The comparison is performed according
4658 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4659 *----------------------------------------------------------------------------*/
4660 
4661 int float64_lt(float64 a, float64 b, float_status *status)
4662 {
4663     flag aSign, bSign;
4664     uint64_t av, bv;
4665 
4666     a = float64_squash_input_denormal(a, status);
4667     b = float64_squash_input_denormal(b, status);
4668     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4669          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4670        ) {
4671         float_raise(float_flag_invalid, status);
4672         return 0;
4673     }
4674     aSign = extractFloat64Sign( a );
4675     bSign = extractFloat64Sign( b );
4676     av = float64_val(a);
4677     bv = float64_val(b);
4678     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4679     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4680 
4681 }
4682 
4683 /*----------------------------------------------------------------------------
4684 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4685 | be compared, and 0 otherwise.  The invalid exception is raised if either
4686 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
4687 | Standard for Binary Floating-Point Arithmetic.
4688 *----------------------------------------------------------------------------*/
4689 
4690 int float64_unordered(float64 a, float64 b, float_status *status)
4691 {
4692     a = float64_squash_input_denormal(a, status);
4693     b = float64_squash_input_denormal(b, status);
4694 
4695     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4696          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4697        ) {
4698         float_raise(float_flag_invalid, status);
4699         return 1;
4700     }
4701     return 0;
4702 }
4703 
4704 /*----------------------------------------------------------------------------
4705 | Returns 1 if the double-precision floating-point value `a' is equal to the
4706 | corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4707 | exception.The comparison is performed according to the IEC/IEEE Standard
4708 | for Binary Floating-Point Arithmetic.
4709 *----------------------------------------------------------------------------*/
4710 
4711 int float64_eq_quiet(float64 a, float64 b, float_status *status)
4712 {
4713     uint64_t av, bv;
4714     a = float64_squash_input_denormal(a, status);
4715     b = float64_squash_input_denormal(b, status);
4716 
4717     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4718          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4719        ) {
4720         if (float64_is_signaling_nan(a, status)
4721          || float64_is_signaling_nan(b, status)) {
4722             float_raise(float_flag_invalid, status);
4723         }
4724         return 0;
4725     }
4726     av = float64_val(a);
4727     bv = float64_val(b);
4728     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4729 
4730 }
4731 
4732 /*----------------------------------------------------------------------------
4733 | Returns 1 if the double-precision floating-point value `a' is less than or
4734 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4735 | cause an exception.  Otherwise, the comparison is performed according to the
4736 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4737 *----------------------------------------------------------------------------*/
4738 
4739 int float64_le_quiet(float64 a, float64 b, float_status *status)
4740 {
4741     flag aSign, bSign;
4742     uint64_t av, bv;
4743     a = float64_squash_input_denormal(a, status);
4744     b = float64_squash_input_denormal(b, status);
4745 
4746     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4747          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4748        ) {
4749         if (float64_is_signaling_nan(a, status)
4750          || float64_is_signaling_nan(b, status)) {
4751             float_raise(float_flag_invalid, status);
4752         }
4753         return 0;
4754     }
4755     aSign = extractFloat64Sign( a );
4756     bSign = extractFloat64Sign( b );
4757     av = float64_val(a);
4758     bv = float64_val(b);
4759     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4760     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4761 
4762 }
4763 
4764 /*----------------------------------------------------------------------------
4765 | Returns 1 if the double-precision floating-point value `a' is less than
4766 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4767 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4768 | Standard for Binary Floating-Point Arithmetic.
4769 *----------------------------------------------------------------------------*/
4770 
4771 int float64_lt_quiet(float64 a, float64 b, float_status *status)
4772 {
4773     flag aSign, bSign;
4774     uint64_t av, bv;
4775     a = float64_squash_input_denormal(a, status);
4776     b = float64_squash_input_denormal(b, status);
4777 
4778     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4779          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4780        ) {
4781         if (float64_is_signaling_nan(a, status)
4782          || float64_is_signaling_nan(b, status)) {
4783             float_raise(float_flag_invalid, status);
4784         }
4785         return 0;
4786     }
4787     aSign = extractFloat64Sign( a );
4788     bSign = extractFloat64Sign( b );
4789     av = float64_val(a);
4790     bv = float64_val(b);
4791     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4792     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4793 
4794 }
4795 
4796 /*----------------------------------------------------------------------------
4797 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4798 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
4799 | comparison is performed according to the IEC/IEEE Standard for Binary
4800 | Floating-Point Arithmetic.
4801 *----------------------------------------------------------------------------*/
4802 
4803 int float64_unordered_quiet(float64 a, float64 b, float_status *status)
4804 {
4805     a = float64_squash_input_denormal(a, status);
4806     b = float64_squash_input_denormal(b, status);
4807 
4808     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4809          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4810        ) {
4811         if (float64_is_signaling_nan(a, status)
4812          || float64_is_signaling_nan(b, status)) {
4813             float_raise(float_flag_invalid, status);
4814         }
4815         return 1;
4816     }
4817     return 0;
4818 }
4819 
4820 /*----------------------------------------------------------------------------
4821 | Returns the result of converting the extended double-precision floating-
4822 | point value `a' to the 32-bit two's complement integer format.  The
4823 | conversion is performed according to the IEC/IEEE Standard for Binary
4824 | Floating-Point Arithmetic---which means in particular that the conversion
4825 | is rounded according to the current rounding mode.  If `a' is a NaN, the
4826 | largest positive integer is returned.  Otherwise, if the conversion
4827 | overflows, the largest integer with the same sign as `a' is returned.
4828 *----------------------------------------------------------------------------*/
4829 
4830 int32_t floatx80_to_int32(floatx80 a, float_status *status)
4831 {
4832     flag aSign;
4833     int32_t aExp, shiftCount;
4834     uint64_t aSig;
4835 
4836     if (floatx80_invalid_encoding(a)) {
4837         float_raise(float_flag_invalid, status);
4838         return 1 << 31;
4839     }
4840     aSig = extractFloatx80Frac( a );
4841     aExp = extractFloatx80Exp( a );
4842     aSign = extractFloatx80Sign( a );
4843     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4844     shiftCount = 0x4037 - aExp;
4845     if ( shiftCount <= 0 ) shiftCount = 1;
4846     shift64RightJamming( aSig, shiftCount, &aSig );
4847     return roundAndPackInt32(aSign, aSig, status);
4848 
4849 }
4850 
4851 /*----------------------------------------------------------------------------
4852 | Returns the result of converting the extended double-precision floating-
4853 | point value `a' to the 32-bit two's complement integer format.  The
4854 | conversion is performed according to the IEC/IEEE Standard for Binary
4855 | Floating-Point Arithmetic, except that the conversion is always rounded
4856 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4857 | Otherwise, if the conversion overflows, the largest integer with the same
4858 | sign as `a' is returned.
4859 *----------------------------------------------------------------------------*/
4860 
4861 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
4862 {
4863     flag aSign;
4864     int32_t aExp, shiftCount;
4865     uint64_t aSig, savedASig;
4866     int32_t z;
4867 
4868     if (floatx80_invalid_encoding(a)) {
4869         float_raise(float_flag_invalid, status);
4870         return 1 << 31;
4871     }
4872     aSig = extractFloatx80Frac( a );
4873     aExp = extractFloatx80Exp( a );
4874     aSign = extractFloatx80Sign( a );
4875     if ( 0x401E < aExp ) {
4876         if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4877         goto invalid;
4878     }
4879     else if ( aExp < 0x3FFF ) {
4880         if (aExp || aSig) {
4881             status->float_exception_flags |= float_flag_inexact;
4882         }
4883         return 0;
4884     }
4885     shiftCount = 0x403E - aExp;
4886     savedASig = aSig;
4887     aSig >>= shiftCount;
4888     z = aSig;
4889     if ( aSign ) z = - z;
4890     if ( ( z < 0 ) ^ aSign ) {
4891  invalid:
4892         float_raise(float_flag_invalid, status);
4893         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4894     }
4895     if ( ( aSig<<shiftCount ) != savedASig ) {
4896         status->float_exception_flags |= float_flag_inexact;
4897     }
4898     return z;
4899 
4900 }
4901 
4902 /*----------------------------------------------------------------------------
4903 | Returns the result of converting the extended double-precision floating-
4904 | point value `a' to the 64-bit two's complement integer format.  The
4905 | conversion is performed according to the IEC/IEEE Standard for Binary
4906 | Floating-Point Arithmetic---which means in particular that the conversion
4907 | is rounded according to the current rounding mode.  If `a' is a NaN,
4908 | the largest positive integer is returned.  Otherwise, if the conversion
4909 | overflows, the largest integer with the same sign as `a' is returned.
4910 *----------------------------------------------------------------------------*/
4911 
4912 int64_t floatx80_to_int64(floatx80 a, float_status *status)
4913 {
4914     flag aSign;
4915     int32_t aExp, shiftCount;
4916     uint64_t aSig, aSigExtra;
4917 
4918     if (floatx80_invalid_encoding(a)) {
4919         float_raise(float_flag_invalid, status);
4920         return 1ULL << 63;
4921     }
4922     aSig = extractFloatx80Frac( a );
4923     aExp = extractFloatx80Exp( a );
4924     aSign = extractFloatx80Sign( a );
4925     shiftCount = 0x403E - aExp;
4926     if ( shiftCount <= 0 ) {
4927         if ( shiftCount ) {
4928             float_raise(float_flag_invalid, status);
4929             if (    ! aSign
4930                  || (    ( aExp == 0x7FFF )
4931                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
4932                ) {
4933                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4934             }
4935             return (int64_t) LIT64( 0x8000000000000000 );
4936         }
4937         aSigExtra = 0;
4938     }
4939     else {
4940         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4941     }
4942     return roundAndPackInt64(aSign, aSig, aSigExtra, status);
4943 
4944 }
4945 
4946 /*----------------------------------------------------------------------------
4947 | Returns the result of converting the extended double-precision floating-
4948 | point value `a' to the 64-bit two's complement integer format.  The
4949 | conversion is performed according to the IEC/IEEE Standard for Binary
4950 | Floating-Point Arithmetic, except that the conversion is always rounded
4951 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4952 | Otherwise, if the conversion overflows, the largest integer with the same
4953 | sign as `a' is returned.
4954 *----------------------------------------------------------------------------*/
4955 
4956 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
4957 {
4958     flag aSign;
4959     int32_t aExp, shiftCount;
4960     uint64_t aSig;
4961     int64_t z;
4962 
4963     if (floatx80_invalid_encoding(a)) {
4964         float_raise(float_flag_invalid, status);
4965         return 1ULL << 63;
4966     }
4967     aSig = extractFloatx80Frac( a );
4968     aExp = extractFloatx80Exp( a );
4969     aSign = extractFloatx80Sign( a );
4970     shiftCount = aExp - 0x403E;
4971     if ( 0 <= shiftCount ) {
4972         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4973         if ( ( a.high != 0xC03E ) || aSig ) {
4974             float_raise(float_flag_invalid, status);
4975             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4976                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4977             }
4978         }
4979         return (int64_t) LIT64( 0x8000000000000000 );
4980     }
4981     else if ( aExp < 0x3FFF ) {
4982         if (aExp | aSig) {
4983             status->float_exception_flags |= float_flag_inexact;
4984         }
4985         return 0;
4986     }
4987     z = aSig>>( - shiftCount );
4988     if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
4989         status->float_exception_flags |= float_flag_inexact;
4990     }
4991     if ( aSign ) z = - z;
4992     return z;
4993 
4994 }
4995 
4996 /*----------------------------------------------------------------------------
4997 | Returns the result of converting the extended double-precision floating-
4998 | point value `a' to the single-precision floating-point format.  The
4999 | conversion is performed according to the IEC/IEEE Standard for Binary
5000 | Floating-Point Arithmetic.
5001 *----------------------------------------------------------------------------*/
5002 
5003 float32 floatx80_to_float32(floatx80 a, float_status *status)
5004 {
5005     flag aSign;
5006     int32_t aExp;
5007     uint64_t aSig;
5008 
5009     if (floatx80_invalid_encoding(a)) {
5010         float_raise(float_flag_invalid, status);
5011         return float32_default_nan(status);
5012     }
5013     aSig = extractFloatx80Frac( a );
5014     aExp = extractFloatx80Exp( a );
5015     aSign = extractFloatx80Sign( a );
5016     if ( aExp == 0x7FFF ) {
5017         if ( (uint64_t) ( aSig<<1 ) ) {
5018             return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
5019         }
5020         return packFloat32( aSign, 0xFF, 0 );
5021     }
5022     shift64RightJamming( aSig, 33, &aSig );
5023     if ( aExp || aSig ) aExp -= 0x3F81;
5024     return roundAndPackFloat32(aSign, aExp, aSig, status);
5025 
5026 }
5027 
5028 /*----------------------------------------------------------------------------
5029 | Returns the result of converting the extended double-precision floating-
5030 | point value `a' to the double-precision floating-point format.  The
5031 | conversion is performed according to the IEC/IEEE Standard for Binary
5032 | Floating-Point Arithmetic.
5033 *----------------------------------------------------------------------------*/
5034 
5035 float64 floatx80_to_float64(floatx80 a, float_status *status)
5036 {
5037     flag aSign;
5038     int32_t aExp;
5039     uint64_t aSig, zSig;
5040 
5041     if (floatx80_invalid_encoding(a)) {
5042         float_raise(float_flag_invalid, status);
5043         return float64_default_nan(status);
5044     }
5045     aSig = extractFloatx80Frac( a );
5046     aExp = extractFloatx80Exp( a );
5047     aSign = extractFloatx80Sign( a );
5048     if ( aExp == 0x7FFF ) {
5049         if ( (uint64_t) ( aSig<<1 ) ) {
5050             return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
5051         }
5052         return packFloat64( aSign, 0x7FF, 0 );
5053     }
5054     shift64RightJamming( aSig, 1, &zSig );
5055     if ( aExp || aSig ) aExp -= 0x3C01;
5056     return roundAndPackFloat64(aSign, aExp, zSig, status);
5057 
5058 }
5059 
5060 /*----------------------------------------------------------------------------
5061 | Returns the result of converting the extended double-precision floating-
5062 | point value `a' to the quadruple-precision floating-point format.  The
5063 | conversion is performed according to the IEC/IEEE Standard for Binary
5064 | Floating-Point Arithmetic.
5065 *----------------------------------------------------------------------------*/
5066 
5067 float128 floatx80_to_float128(floatx80 a, float_status *status)
5068 {
5069     flag aSign;
5070     int aExp;
5071     uint64_t aSig, zSig0, zSig1;
5072 
5073     if (floatx80_invalid_encoding(a)) {
5074         float_raise(float_flag_invalid, status);
5075         return float128_default_nan(status);
5076     }
5077     aSig = extractFloatx80Frac( a );
5078     aExp = extractFloatx80Exp( a );
5079     aSign = extractFloatx80Sign( a );
5080     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
5081         return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
5082     }
5083     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
5084     return packFloat128( aSign, aExp, zSig0, zSig1 );
5085 
5086 }
5087 
5088 /*----------------------------------------------------------------------------
5089 | Rounds the extended double-precision floating-point value `a' to an integer,
5090 | and returns the result as an extended quadruple-precision floating-point
5091 | value.  The operation is performed according to the IEC/IEEE Standard for
5092 | Binary Floating-Point Arithmetic.
5093 *----------------------------------------------------------------------------*/
5094 
5095 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
5096 {
5097     flag aSign;
5098     int32_t aExp;
5099     uint64_t lastBitMask, roundBitsMask;
5100     floatx80 z;
5101 
5102     if (floatx80_invalid_encoding(a)) {
5103         float_raise(float_flag_invalid, status);
5104         return floatx80_default_nan(status);
5105     }
5106     aExp = extractFloatx80Exp( a );
5107     if ( 0x403E <= aExp ) {
5108         if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
5109             return propagateFloatx80NaN(a, a, status);
5110         }
5111         return a;
5112     }
5113     if ( aExp < 0x3FFF ) {
5114         if (    ( aExp == 0 )
5115              && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
5116             return a;
5117         }
5118         status->float_exception_flags |= float_flag_inexact;
5119         aSign = extractFloatx80Sign( a );
5120         switch (status->float_rounding_mode) {
5121          case float_round_nearest_even:
5122             if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
5123                ) {
5124                 return
5125                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
5126             }
5127             break;
5128         case float_round_ties_away:
5129             if (aExp == 0x3FFE) {
5130                 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
5131             }
5132             break;
5133          case float_round_down:
5134             return
5135                   aSign ?
5136                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
5137                 : packFloatx80( 0, 0, 0 );
5138          case float_round_up:
5139             return
5140                   aSign ? packFloatx80( 1, 0, 0 )
5141                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
5142         }
5143         return packFloatx80( aSign, 0, 0 );
5144     }
5145     lastBitMask = 1;
5146     lastBitMask <<= 0x403E - aExp;
5147     roundBitsMask = lastBitMask - 1;
5148     z = a;
5149     switch (status->float_rounding_mode) {
5150     case float_round_nearest_even:
5151         z.low += lastBitMask>>1;
5152         if ((z.low & roundBitsMask) == 0) {
5153             z.low &= ~lastBitMask;
5154         }
5155         break;
5156     case float_round_ties_away:
5157         z.low += lastBitMask >> 1;
5158         break;
5159     case float_round_to_zero:
5160         break;
5161     case float_round_up:
5162         if (!extractFloatx80Sign(z)) {
5163             z.low += roundBitsMask;
5164         }
5165         break;
5166     case float_round_down:
5167         if (extractFloatx80Sign(z)) {
5168             z.low += roundBitsMask;
5169         }
5170         break;
5171     default:
5172         abort();
5173     }
5174     z.low &= ~ roundBitsMask;
5175     if ( z.low == 0 ) {
5176         ++z.high;
5177         z.low = LIT64( 0x8000000000000000 );
5178     }
5179     if (z.low != a.low) {
5180         status->float_exception_flags |= float_flag_inexact;
5181     }
5182     return z;
5183 
5184 }
5185 
5186 /*----------------------------------------------------------------------------
5187 | Returns the result of adding the absolute values of the extended double-
5188 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
5189 | negated before being returned.  `zSign' is ignored if the result is a NaN.
5190 | The addition is performed according to the IEC/IEEE Standard for Binary
5191 | Floating-Point Arithmetic.
5192 *----------------------------------------------------------------------------*/
5193 
5194 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5195                                 float_status *status)
5196 {
5197     int32_t aExp, bExp, zExp;
5198     uint64_t aSig, bSig, zSig0, zSig1;
5199     int32_t expDiff;
5200 
5201     aSig = extractFloatx80Frac( a );
5202     aExp = extractFloatx80Exp( a );
5203     bSig = extractFloatx80Frac( b );
5204     bExp = extractFloatx80Exp( b );
5205     expDiff = aExp - bExp;
5206     if ( 0 < expDiff ) {
5207         if ( aExp == 0x7FFF ) {
5208             if ((uint64_t)(aSig << 1)) {
5209                 return propagateFloatx80NaN(a, b, status);
5210             }
5211             return a;
5212         }
5213         if ( bExp == 0 ) --expDiff;
5214         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5215         zExp = aExp;
5216     }
5217     else if ( expDiff < 0 ) {
5218         if ( bExp == 0x7FFF ) {
5219             if ((uint64_t)(bSig << 1)) {
5220                 return propagateFloatx80NaN(a, b, status);
5221             }
5222             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5223         }
5224         if ( aExp == 0 ) ++expDiff;
5225         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5226         zExp = bExp;
5227     }
5228     else {
5229         if ( aExp == 0x7FFF ) {
5230             if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5231                 return propagateFloatx80NaN(a, b, status);
5232             }
5233             return a;
5234         }
5235         zSig1 = 0;
5236         zSig0 = aSig + bSig;
5237         if ( aExp == 0 ) {
5238             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
5239             goto roundAndPack;
5240         }
5241         zExp = aExp;
5242         goto shiftRight1;
5243     }
5244     zSig0 = aSig + bSig;
5245     if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
5246  shiftRight1:
5247     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
5248     zSig0 |= LIT64( 0x8000000000000000 );
5249     ++zExp;
5250  roundAndPack:
5251     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5252                                 zSign, zExp, zSig0, zSig1, status);
5253 }
5254 
5255 /*----------------------------------------------------------------------------
5256 | Returns the result of subtracting the absolute values of the extended
5257 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
5258 | difference is negated before being returned.  `zSign' is ignored if the
5259 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
5260 | Standard for Binary Floating-Point Arithmetic.
5261 *----------------------------------------------------------------------------*/
5262 
5263 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5264                                 float_status *status)
5265 {
5266     int32_t aExp, bExp, zExp;
5267     uint64_t aSig, bSig, zSig0, zSig1;
5268     int32_t expDiff;
5269 
5270     aSig = extractFloatx80Frac( a );
5271     aExp = extractFloatx80Exp( a );
5272     bSig = extractFloatx80Frac( b );
5273     bExp = extractFloatx80Exp( b );
5274     expDiff = aExp - bExp;
5275     if ( 0 < expDiff ) goto aExpBigger;
5276     if ( expDiff < 0 ) goto bExpBigger;
5277     if ( aExp == 0x7FFF ) {
5278         if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5279             return propagateFloatx80NaN(a, b, status);
5280         }
5281         float_raise(float_flag_invalid, status);
5282         return floatx80_default_nan(status);
5283     }
5284     if ( aExp == 0 ) {
5285         aExp = 1;
5286         bExp = 1;
5287     }
5288     zSig1 = 0;
5289     if ( bSig < aSig ) goto aBigger;
5290     if ( aSig < bSig ) goto bBigger;
5291     return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
5292  bExpBigger:
5293     if ( bExp == 0x7FFF ) {
5294         if ((uint64_t)(bSig << 1)) {
5295             return propagateFloatx80NaN(a, b, status);
5296         }
5297         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
5298     }
5299     if ( aExp == 0 ) ++expDiff;
5300     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5301  bBigger:
5302     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
5303     zExp = bExp;
5304     zSign ^= 1;
5305     goto normalizeRoundAndPack;
5306  aExpBigger:
5307     if ( aExp == 0x7FFF ) {
5308         if ((uint64_t)(aSig << 1)) {
5309             return propagateFloatx80NaN(a, b, status);
5310         }
5311         return a;
5312     }
5313     if ( bExp == 0 ) --expDiff;
5314     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5315  aBigger:
5316     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
5317     zExp = aExp;
5318  normalizeRoundAndPack:
5319     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
5320                                          zSign, zExp, zSig0, zSig1, status);
5321 }
5322 
5323 /*----------------------------------------------------------------------------
5324 | Returns the result of adding the extended double-precision floating-point
5325 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
5326 | Standard for Binary Floating-Point Arithmetic.
5327 *----------------------------------------------------------------------------*/
5328 
5329 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
5330 {
5331     flag aSign, bSign;
5332 
5333     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5334         float_raise(float_flag_invalid, status);
5335         return floatx80_default_nan(status);
5336     }
5337     aSign = extractFloatx80Sign( a );
5338     bSign = extractFloatx80Sign( b );
5339     if ( aSign == bSign ) {
5340         return addFloatx80Sigs(a, b, aSign, status);
5341     }
5342     else {
5343         return subFloatx80Sigs(a, b, aSign, status);
5344     }
5345 
5346 }
5347 
5348 /*----------------------------------------------------------------------------
5349 | Returns the result of subtracting the extended double-precision floating-
5350 | point values `a' and `b'.  The operation is performed according to the
5351 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5352 *----------------------------------------------------------------------------*/
5353 
5354 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
5355 {
5356     flag aSign, bSign;
5357 
5358     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5359         float_raise(float_flag_invalid, status);
5360         return floatx80_default_nan(status);
5361     }
5362     aSign = extractFloatx80Sign( a );
5363     bSign = extractFloatx80Sign( b );
5364     if ( aSign == bSign ) {
5365         return subFloatx80Sigs(a, b, aSign, status);
5366     }
5367     else {
5368         return addFloatx80Sigs(a, b, aSign, status);
5369     }
5370 
5371 }
5372 
5373 /*----------------------------------------------------------------------------
5374 | Returns the result of multiplying the extended double-precision floating-
5375 | point values `a' and `b'.  The operation is performed according to the
5376 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5377 *----------------------------------------------------------------------------*/
5378 
5379 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
5380 {
5381     flag aSign, bSign, zSign;
5382     int32_t aExp, bExp, zExp;
5383     uint64_t aSig, bSig, zSig0, zSig1;
5384 
5385     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5386         float_raise(float_flag_invalid, status);
5387         return floatx80_default_nan(status);
5388     }
5389     aSig = extractFloatx80Frac( a );
5390     aExp = extractFloatx80Exp( a );
5391     aSign = extractFloatx80Sign( a );
5392     bSig = extractFloatx80Frac( b );
5393     bExp = extractFloatx80Exp( b );
5394     bSign = extractFloatx80Sign( b );
5395     zSign = aSign ^ bSign;
5396     if ( aExp == 0x7FFF ) {
5397         if (    (uint64_t) ( aSig<<1 )
5398              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5399             return propagateFloatx80NaN(a, b, status);
5400         }
5401         if ( ( bExp | bSig ) == 0 ) goto invalid;
5402         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5403     }
5404     if ( bExp == 0x7FFF ) {
5405         if ((uint64_t)(bSig << 1)) {
5406             return propagateFloatx80NaN(a, b, status);
5407         }
5408         if ( ( aExp | aSig ) == 0 ) {
5409  invalid:
5410             float_raise(float_flag_invalid, status);
5411             return floatx80_default_nan(status);
5412         }
5413         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5414     }
5415     if ( aExp == 0 ) {
5416         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5417         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5418     }
5419     if ( bExp == 0 ) {
5420         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5421         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5422     }
5423     zExp = aExp + bExp - 0x3FFE;
5424     mul64To128( aSig, bSig, &zSig0, &zSig1 );
5425     if ( 0 < (int64_t) zSig0 ) {
5426         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5427         --zExp;
5428     }
5429     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5430                                 zSign, zExp, zSig0, zSig1, status);
5431 }
5432 
5433 /*----------------------------------------------------------------------------
5434 | Returns the result of dividing the extended double-precision floating-point
5435 | value `a' by the corresponding value `b'.  The operation is performed
5436 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5437 *----------------------------------------------------------------------------*/
5438 
5439 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
5440 {
5441     flag aSign, bSign, zSign;
5442     int32_t aExp, bExp, zExp;
5443     uint64_t aSig, bSig, zSig0, zSig1;
5444     uint64_t rem0, rem1, rem2, term0, term1, term2;
5445 
5446     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5447         float_raise(float_flag_invalid, status);
5448         return floatx80_default_nan(status);
5449     }
5450     aSig = extractFloatx80Frac( a );
5451     aExp = extractFloatx80Exp( a );
5452     aSign = extractFloatx80Sign( a );
5453     bSig = extractFloatx80Frac( b );
5454     bExp = extractFloatx80Exp( b );
5455     bSign = extractFloatx80Sign( b );
5456     zSign = aSign ^ bSign;
5457     if ( aExp == 0x7FFF ) {
5458         if ((uint64_t)(aSig << 1)) {
5459             return propagateFloatx80NaN(a, b, status);
5460         }
5461         if ( bExp == 0x7FFF ) {
5462             if ((uint64_t)(bSig << 1)) {
5463                 return propagateFloatx80NaN(a, b, status);
5464             }
5465             goto invalid;
5466         }
5467         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5468     }
5469     if ( bExp == 0x7FFF ) {
5470         if ((uint64_t)(bSig << 1)) {
5471             return propagateFloatx80NaN(a, b, status);
5472         }
5473         return packFloatx80( zSign, 0, 0 );
5474     }
5475     if ( bExp == 0 ) {
5476         if ( bSig == 0 ) {
5477             if ( ( aExp | aSig ) == 0 ) {
5478  invalid:
5479                 float_raise(float_flag_invalid, status);
5480                 return floatx80_default_nan(status);
5481             }
5482             float_raise(float_flag_divbyzero, status);
5483             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5484         }
5485         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5486     }
5487     if ( aExp == 0 ) {
5488         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5489         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5490     }
5491     zExp = aExp - bExp + 0x3FFE;
5492     rem1 = 0;
5493     if ( bSig <= aSig ) {
5494         shift128Right( aSig, 0, 1, &aSig, &rem1 );
5495         ++zExp;
5496     }
5497     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5498     mul64To128( bSig, zSig0, &term0, &term1 );
5499     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
5500     while ( (int64_t) rem0 < 0 ) {
5501         --zSig0;
5502         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5503     }
5504     zSig1 = estimateDiv128To64( rem1, 0, bSig );
5505     if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
5506         mul64To128( bSig, zSig1, &term1, &term2 );
5507         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5508         while ( (int64_t) rem1 < 0 ) {
5509             --zSig1;
5510             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5511         }
5512         zSig1 |= ( ( rem1 | rem2 ) != 0 );
5513     }
5514     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5515                                 zSign, zExp, zSig0, zSig1, status);
5516 }
5517 
5518 /*----------------------------------------------------------------------------
5519 | Returns the remainder of the extended double-precision floating-point value
5520 | `a' with respect to the corresponding value `b'.  The operation is performed
5521 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5522 *----------------------------------------------------------------------------*/
5523 
5524 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
5525 {
5526     flag aSign, zSign;
5527     int32_t aExp, bExp, expDiff;
5528     uint64_t aSig0, aSig1, bSig;
5529     uint64_t q, term0, term1, alternateASig0, alternateASig1;
5530 
5531     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5532         float_raise(float_flag_invalid, status);
5533         return floatx80_default_nan(status);
5534     }
5535     aSig0 = extractFloatx80Frac( a );
5536     aExp = extractFloatx80Exp( a );
5537     aSign = extractFloatx80Sign( a );
5538     bSig = extractFloatx80Frac( b );
5539     bExp = extractFloatx80Exp( b );
5540     if ( aExp == 0x7FFF ) {
5541         if (    (uint64_t) ( aSig0<<1 )
5542              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5543             return propagateFloatx80NaN(a, b, status);
5544         }
5545         goto invalid;
5546     }
5547     if ( bExp == 0x7FFF ) {
5548         if ((uint64_t)(bSig << 1)) {
5549             return propagateFloatx80NaN(a, b, status);
5550         }
5551         return a;
5552     }
5553     if ( bExp == 0 ) {
5554         if ( bSig == 0 ) {
5555  invalid:
5556             float_raise(float_flag_invalid, status);
5557             return floatx80_default_nan(status);
5558         }
5559         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5560     }
5561     if ( aExp == 0 ) {
5562         if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
5563         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5564     }
5565     bSig |= LIT64( 0x8000000000000000 );
5566     zSign = aSign;
5567     expDiff = aExp - bExp;
5568     aSig1 = 0;
5569     if ( expDiff < 0 ) {
5570         if ( expDiff < -1 ) return a;
5571         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5572         expDiff = 0;
5573     }
5574     q = ( bSig <= aSig0 );
5575     if ( q ) aSig0 -= bSig;
5576     expDiff -= 64;
5577     while ( 0 < expDiff ) {
5578         q = estimateDiv128To64( aSig0, aSig1, bSig );
5579         q = ( 2 < q ) ? q - 2 : 0;
5580         mul64To128( bSig, q, &term0, &term1 );
5581         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5582         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5583         expDiff -= 62;
5584     }
5585     expDiff += 64;
5586     if ( 0 < expDiff ) {
5587         q = estimateDiv128To64( aSig0, aSig1, bSig );
5588         q = ( 2 < q ) ? q - 2 : 0;
5589         q >>= 64 - expDiff;
5590         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5591         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5592         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5593         while ( le128( term0, term1, aSig0, aSig1 ) ) {
5594             ++q;
5595             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5596         }
5597     }
5598     else {
5599         term1 = 0;
5600         term0 = bSig;
5601     }
5602     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5603     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5604          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5605               && ( q & 1 ) )
5606        ) {
5607         aSig0 = alternateASig0;
5608         aSig1 = alternateASig1;
5609         zSign = ! zSign;
5610     }
5611     return
5612         normalizeRoundAndPackFloatx80(
5613             80, zSign, bExp + expDiff, aSig0, aSig1, status);
5614 
5615 }
5616 
5617 /*----------------------------------------------------------------------------
5618 | Returns the square root of the extended double-precision floating-point
5619 | value `a'.  The operation is performed according to the IEC/IEEE Standard
5620 | for Binary Floating-Point Arithmetic.
5621 *----------------------------------------------------------------------------*/
5622 
5623 floatx80 floatx80_sqrt(floatx80 a, float_status *status)
5624 {
5625     flag aSign;
5626     int32_t aExp, zExp;
5627     uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5628     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5629 
5630     if (floatx80_invalid_encoding(a)) {
5631         float_raise(float_flag_invalid, status);
5632         return floatx80_default_nan(status);
5633     }
5634     aSig0 = extractFloatx80Frac( a );
5635     aExp = extractFloatx80Exp( a );
5636     aSign = extractFloatx80Sign( a );
5637     if ( aExp == 0x7FFF ) {
5638         if ((uint64_t)(aSig0 << 1)) {
5639             return propagateFloatx80NaN(a, a, status);
5640         }
5641         if ( ! aSign ) return a;
5642         goto invalid;
5643     }
5644     if ( aSign ) {
5645         if ( ( aExp | aSig0 ) == 0 ) return a;
5646  invalid:
5647         float_raise(float_flag_invalid, status);
5648         return floatx80_default_nan(status);
5649     }
5650     if ( aExp == 0 ) {
5651         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5652         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5653     }
5654     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5655     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5656     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5657     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5658     doubleZSig0 = zSig0<<1;
5659     mul64To128( zSig0, zSig0, &term0, &term1 );
5660     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5661     while ( (int64_t) rem0 < 0 ) {
5662         --zSig0;
5663         doubleZSig0 -= 2;
5664         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5665     }
5666     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5667     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5668         if ( zSig1 == 0 ) zSig1 = 1;
5669         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5670         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5671         mul64To128( zSig1, zSig1, &term2, &term3 );
5672         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5673         while ( (int64_t) rem1 < 0 ) {
5674             --zSig1;
5675             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5676             term3 |= 1;
5677             term2 |= doubleZSig0;
5678             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5679         }
5680         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5681     }
5682     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5683     zSig0 |= doubleZSig0;
5684     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5685                                 0, zExp, zSig0, zSig1, status);
5686 }
5687 
5688 /*----------------------------------------------------------------------------
5689 | Returns 1 if the extended double-precision floating-point value `a' is equal
5690 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
5691 | raised if either operand is a NaN.  Otherwise, the comparison is performed
5692 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5693 *----------------------------------------------------------------------------*/
5694 
5695 int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
5696 {
5697 
5698     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5699         || (extractFloatx80Exp(a) == 0x7FFF
5700             && (uint64_t) (extractFloatx80Frac(a) << 1))
5701         || (extractFloatx80Exp(b) == 0x7FFF
5702             && (uint64_t) (extractFloatx80Frac(b) << 1))
5703        ) {
5704         float_raise(float_flag_invalid, status);
5705         return 0;
5706     }
5707     return
5708            ( a.low == b.low )
5709         && (    ( a.high == b.high )
5710              || (    ( a.low == 0 )
5711                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5712            );
5713 
5714 }
5715 
5716 /*----------------------------------------------------------------------------
5717 | Returns 1 if the extended double-precision floating-point value `a' is
5718 | less than or equal to the corresponding value `b', and 0 otherwise.  The
5719 | invalid exception is raised if either operand is a NaN.  The comparison is
5720 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5721 | Arithmetic.
5722 *----------------------------------------------------------------------------*/
5723 
5724 int floatx80_le(floatx80 a, floatx80 b, float_status *status)
5725 {
5726     flag aSign, bSign;
5727 
5728     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5729         || (extractFloatx80Exp(a) == 0x7FFF
5730             && (uint64_t) (extractFloatx80Frac(a) << 1))
5731         || (extractFloatx80Exp(b) == 0x7FFF
5732             && (uint64_t) (extractFloatx80Frac(b) << 1))
5733        ) {
5734         float_raise(float_flag_invalid, status);
5735         return 0;
5736     }
5737     aSign = extractFloatx80Sign( a );
5738     bSign = extractFloatx80Sign( b );
5739     if ( aSign != bSign ) {
5740         return
5741                aSign
5742             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5743                  == 0 );
5744     }
5745     return
5746           aSign ? le128( b.high, b.low, a.high, a.low )
5747         : le128( a.high, a.low, b.high, b.low );
5748 
5749 }
5750 
5751 /*----------------------------------------------------------------------------
5752 | Returns 1 if the extended double-precision floating-point value `a' is
5753 | less than the corresponding value `b', and 0 otherwise.  The invalid
5754 | exception is raised if either operand is a NaN.  The comparison is performed
5755 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5756 *----------------------------------------------------------------------------*/
5757 
5758 int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
5759 {
5760     flag aSign, bSign;
5761 
5762     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5763         || (extractFloatx80Exp(a) == 0x7FFF
5764             && (uint64_t) (extractFloatx80Frac(a) << 1))
5765         || (extractFloatx80Exp(b) == 0x7FFF
5766             && (uint64_t) (extractFloatx80Frac(b) << 1))
5767        ) {
5768         float_raise(float_flag_invalid, status);
5769         return 0;
5770     }
5771     aSign = extractFloatx80Sign( a );
5772     bSign = extractFloatx80Sign( b );
5773     if ( aSign != bSign ) {
5774         return
5775                aSign
5776             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5777                  != 0 );
5778     }
5779     return
5780           aSign ? lt128( b.high, b.low, a.high, a.low )
5781         : lt128( a.high, a.low, b.high, b.low );
5782 
5783 }
5784 
5785 /*----------------------------------------------------------------------------
5786 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5787 | cannot be compared, and 0 otherwise.  The invalid exception is raised if
5788 | either operand is a NaN.   The comparison is performed according to the
5789 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5790 *----------------------------------------------------------------------------*/
5791 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
5792 {
5793     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5794         || (extractFloatx80Exp(a) == 0x7FFF
5795             && (uint64_t) (extractFloatx80Frac(a) << 1))
5796         || (extractFloatx80Exp(b) == 0x7FFF
5797             && (uint64_t) (extractFloatx80Frac(b) << 1))
5798        ) {
5799         float_raise(float_flag_invalid, status);
5800         return 1;
5801     }
5802     return 0;
5803 }
5804 
5805 /*----------------------------------------------------------------------------
5806 | Returns 1 if the extended double-precision floating-point value `a' is
5807 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5808 | cause an exception.  The comparison is performed according to the IEC/IEEE
5809 | Standard for Binary Floating-Point Arithmetic.
5810 *----------------------------------------------------------------------------*/
5811 
5812 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
5813 {
5814 
5815     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5816         float_raise(float_flag_invalid, status);
5817         return 0;
5818     }
5819     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5820               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5821          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5822               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5823        ) {
5824         if (floatx80_is_signaling_nan(a, status)
5825          || floatx80_is_signaling_nan(b, status)) {
5826             float_raise(float_flag_invalid, status);
5827         }
5828         return 0;
5829     }
5830     return
5831            ( a.low == b.low )
5832         && (    ( a.high == b.high )
5833              || (    ( a.low == 0 )
5834                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5835            );
5836 
5837 }
5838 
5839 /*----------------------------------------------------------------------------
5840 | Returns 1 if the extended double-precision floating-point value `a' is less
5841 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
5842 | do not cause an exception.  Otherwise, the comparison is performed according
5843 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5844 *----------------------------------------------------------------------------*/
5845 
5846 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
5847 {
5848     flag aSign, bSign;
5849 
5850     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5851         float_raise(float_flag_invalid, status);
5852         return 0;
5853     }
5854     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5855               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5856          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5857               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5858        ) {
5859         if (floatx80_is_signaling_nan(a, status)
5860          || floatx80_is_signaling_nan(b, status)) {
5861             float_raise(float_flag_invalid, status);
5862         }
5863         return 0;
5864     }
5865     aSign = extractFloatx80Sign( a );
5866     bSign = extractFloatx80Sign( b );
5867     if ( aSign != bSign ) {
5868         return
5869                aSign
5870             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5871                  == 0 );
5872     }
5873     return
5874           aSign ? le128( b.high, b.low, a.high, a.low )
5875         : le128( a.high, a.low, b.high, b.low );
5876 
5877 }
5878 
5879 /*----------------------------------------------------------------------------
5880 | Returns 1 if the extended double-precision floating-point value `a' is less
5881 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
5882 | an exception.  Otherwise, the comparison is performed according to the
5883 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5884 *----------------------------------------------------------------------------*/
5885 
5886 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
5887 {
5888     flag aSign, bSign;
5889 
5890     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5891         float_raise(float_flag_invalid, status);
5892         return 0;
5893     }
5894     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5895               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5896          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5897               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5898        ) {
5899         if (floatx80_is_signaling_nan(a, status)
5900          || floatx80_is_signaling_nan(b, status)) {
5901             float_raise(float_flag_invalid, status);
5902         }
5903         return 0;
5904     }
5905     aSign = extractFloatx80Sign( a );
5906     bSign = extractFloatx80Sign( b );
5907     if ( aSign != bSign ) {
5908         return
5909                aSign
5910             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5911                  != 0 );
5912     }
5913     return
5914           aSign ? lt128( b.high, b.low, a.high, a.low )
5915         : lt128( a.high, a.low, b.high, b.low );
5916 
5917 }
5918 
5919 /*----------------------------------------------------------------------------
5920 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5921 | cannot be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.
5922 | The comparison is performed according to the IEC/IEEE Standard for Binary
5923 | Floating-Point Arithmetic.
5924 *----------------------------------------------------------------------------*/
5925 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
5926 {
5927     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5928         float_raise(float_flag_invalid, status);
5929         return 1;
5930     }
5931     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5932               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5933          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5934               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5935        ) {
5936         if (floatx80_is_signaling_nan(a, status)
5937          || floatx80_is_signaling_nan(b, status)) {
5938             float_raise(float_flag_invalid, status);
5939         }
5940         return 1;
5941     }
5942     return 0;
5943 }
5944 
5945 /*----------------------------------------------------------------------------
5946 | Returns the result of converting the quadruple-precision floating-point
5947 | value `a' to the 32-bit two's complement integer format.  The conversion
5948 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5949 | Arithmetic---which means in particular that the conversion is rounded
5950 | according to the current rounding mode.  If `a' is a NaN, the largest
5951 | positive integer is returned.  Otherwise, if the conversion overflows, the
5952 | largest integer with the same sign as `a' is returned.
5953 *----------------------------------------------------------------------------*/
5954 
5955 int32_t float128_to_int32(float128 a, float_status *status)
5956 {
5957     flag aSign;
5958     int32_t aExp, shiftCount;
5959     uint64_t aSig0, aSig1;
5960 
5961     aSig1 = extractFloat128Frac1( a );
5962     aSig0 = extractFloat128Frac0( a );
5963     aExp = extractFloat128Exp( a );
5964     aSign = extractFloat128Sign( a );
5965     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5966     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5967     aSig0 |= ( aSig1 != 0 );
5968     shiftCount = 0x4028 - aExp;
5969     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5970     return roundAndPackInt32(aSign, aSig0, status);
5971 
5972 }
5973 
5974 /*----------------------------------------------------------------------------
5975 | Returns the result of converting the quadruple-precision floating-point
5976 | value `a' to the 32-bit two's complement integer format.  The conversion
5977 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5978 | Arithmetic, except that the conversion is always rounded toward zero.  If
5979 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
5980 | conversion overflows, the largest integer with the same sign as `a' is
5981 | returned.
5982 *----------------------------------------------------------------------------*/
5983 
5984 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
5985 {
5986     flag aSign;
5987     int32_t aExp, shiftCount;
5988     uint64_t aSig0, aSig1, savedASig;
5989     int32_t z;
5990 
5991     aSig1 = extractFloat128Frac1( a );
5992     aSig0 = extractFloat128Frac0( a );
5993     aExp = extractFloat128Exp( a );
5994     aSign = extractFloat128Sign( a );
5995     aSig0 |= ( aSig1 != 0 );
5996     if ( 0x401E < aExp ) {
5997         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5998         goto invalid;
5999     }
6000     else if ( aExp < 0x3FFF ) {
6001         if (aExp || aSig0) {
6002             status->float_exception_flags |= float_flag_inexact;
6003         }
6004         return 0;
6005     }
6006     aSig0 |= LIT64( 0x0001000000000000 );
6007     shiftCount = 0x402F - aExp;
6008     savedASig = aSig0;
6009     aSig0 >>= shiftCount;
6010     z = aSig0;
6011     if ( aSign ) z = - z;
6012     if ( ( z < 0 ) ^ aSign ) {
6013  invalid:
6014         float_raise(float_flag_invalid, status);
6015         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
6016     }
6017     if ( ( aSig0<<shiftCount ) != savedASig ) {
6018         status->float_exception_flags |= float_flag_inexact;
6019     }
6020     return z;
6021 
6022 }
6023 
6024 /*----------------------------------------------------------------------------
6025 | Returns the result of converting the quadruple-precision floating-point
6026 | value `a' to the 64-bit two's complement integer format.  The conversion
6027 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6028 | Arithmetic---which means in particular that the conversion is rounded
6029 | according to the current rounding mode.  If `a' is a NaN, the largest
6030 | positive integer is returned.  Otherwise, if the conversion overflows, the
6031 | largest integer with the same sign as `a' is returned.
6032 *----------------------------------------------------------------------------*/
6033 
6034 int64_t float128_to_int64(float128 a, float_status *status)
6035 {
6036     flag aSign;
6037     int32_t aExp, shiftCount;
6038     uint64_t aSig0, aSig1;
6039 
6040     aSig1 = extractFloat128Frac1( a );
6041     aSig0 = extractFloat128Frac0( a );
6042     aExp = extractFloat128Exp( a );
6043     aSign = extractFloat128Sign( a );
6044     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
6045     shiftCount = 0x402F - aExp;
6046     if ( shiftCount <= 0 ) {
6047         if ( 0x403E < aExp ) {
6048             float_raise(float_flag_invalid, status);
6049             if (    ! aSign
6050                  || (    ( aExp == 0x7FFF )
6051                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
6052                     )
6053                ) {
6054                 return LIT64( 0x7FFFFFFFFFFFFFFF );
6055             }
6056             return (int64_t) LIT64( 0x8000000000000000 );
6057         }
6058         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
6059     }
6060     else {
6061         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
6062     }
6063     return roundAndPackInt64(aSign, aSig0, aSig1, status);
6064 
6065 }
6066 
6067 /*----------------------------------------------------------------------------
6068 | Returns the result of converting the quadruple-precision floating-point
6069 | value `a' to the 64-bit two's complement integer format.  The conversion
6070 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6071 | Arithmetic, except that the conversion is always rounded toward zero.
6072 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
6073 | the conversion overflows, the largest integer with the same sign as `a' is
6074 | returned.
6075 *----------------------------------------------------------------------------*/
6076 
6077 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
6078 {
6079     flag aSign;
6080     int32_t aExp, shiftCount;
6081     uint64_t aSig0, aSig1;
6082     int64_t z;
6083 
6084     aSig1 = extractFloat128Frac1( a );
6085     aSig0 = extractFloat128Frac0( a );
6086     aExp = extractFloat128Exp( a );
6087     aSign = extractFloat128Sign( a );
6088     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
6089     shiftCount = aExp - 0x402F;
6090     if ( 0 < shiftCount ) {
6091         if ( 0x403E <= aExp ) {
6092             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
6093             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
6094                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
6095                 if (aSig1) {
6096                     status->float_exception_flags |= float_flag_inexact;
6097                 }
6098             }
6099             else {
6100                 float_raise(float_flag_invalid, status);
6101                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
6102                     return LIT64( 0x7FFFFFFFFFFFFFFF );
6103                 }
6104             }
6105             return (int64_t) LIT64( 0x8000000000000000 );
6106         }
6107         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
6108         if ( (uint64_t) ( aSig1<<shiftCount ) ) {
6109             status->float_exception_flags |= float_flag_inexact;
6110         }
6111     }
6112     else {
6113         if ( aExp < 0x3FFF ) {
6114             if ( aExp | aSig0 | aSig1 ) {
6115                 status->float_exception_flags |= float_flag_inexact;
6116             }
6117             return 0;
6118         }
6119         z = aSig0>>( - shiftCount );
6120         if (    aSig1
6121              || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
6122             status->float_exception_flags |= float_flag_inexact;
6123         }
6124     }
6125     if ( aSign ) z = - z;
6126     return z;
6127 
6128 }
6129 
6130 /*----------------------------------------------------------------------------
6131 | Returns the result of converting the quadruple-precision floating-point value
6132 | `a' to the 64-bit unsigned integer format.  The conversion is
6133 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6134 | Arithmetic---which means in particular that the conversion is rounded
6135 | according to the current rounding mode.  If `a' is a NaN, the largest
6136 | positive integer is returned.  If the conversion overflows, the
6137 | largest unsigned integer is returned.  If 'a' is negative, the value is
6138 | rounded and zero is returned; negative values that do not round to zero
6139 | will raise the inexact exception.
6140 *----------------------------------------------------------------------------*/
6141 
6142 uint64_t float128_to_uint64(float128 a, float_status *status)
6143 {
6144     flag aSign;
6145     int aExp;
6146     int shiftCount;
6147     uint64_t aSig0, aSig1;
6148 
6149     aSig0 = extractFloat128Frac0(a);
6150     aSig1 = extractFloat128Frac1(a);
6151     aExp = extractFloat128Exp(a);
6152     aSign = extractFloat128Sign(a);
6153     if (aSign && (aExp > 0x3FFE)) {
6154         float_raise(float_flag_invalid, status);
6155         if (float128_is_any_nan(a)) {
6156             return LIT64(0xFFFFFFFFFFFFFFFF);
6157         } else {
6158             return 0;
6159         }
6160     }
6161     if (aExp) {
6162         aSig0 |= LIT64(0x0001000000000000);
6163     }
6164     shiftCount = 0x402F - aExp;
6165     if (shiftCount <= 0) {
6166         if (0x403E < aExp) {
6167             float_raise(float_flag_invalid, status);
6168             return LIT64(0xFFFFFFFFFFFFFFFF);
6169         }
6170         shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
6171     } else {
6172         shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
6173     }
6174     return roundAndPackUint64(aSign, aSig0, aSig1, status);
6175 }
6176 
6177 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
6178 {
6179     uint64_t v;
6180     signed char current_rounding_mode = status->float_rounding_mode;
6181 
6182     set_float_rounding_mode(float_round_to_zero, status);
6183     v = float128_to_uint64(a, status);
6184     set_float_rounding_mode(current_rounding_mode, status);
6185 
6186     return v;
6187 }
6188 
6189 /*----------------------------------------------------------------------------
6190 | Returns the result of converting the quadruple-precision floating-point
6191 | value `a' to the 32-bit unsigned integer format.  The conversion
6192 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6193 | Arithmetic except that the conversion is always rounded toward zero.
6194 | If `a' is a NaN, the largest positive integer is returned.  Otherwise,
6195 | if the conversion overflows, the largest unsigned integer is returned.
6196 | If 'a' is negative, the value is rounded and zero is returned; negative
6197 | values that do not round to zero will raise the inexact exception.
6198 *----------------------------------------------------------------------------*/
6199 
6200 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
6201 {
6202     uint64_t v;
6203     uint32_t res;
6204     int old_exc_flags = get_float_exception_flags(status);
6205 
6206     v = float128_to_uint64_round_to_zero(a, status);
6207     if (v > 0xffffffff) {
6208         res = 0xffffffff;
6209     } else {
6210         return v;
6211     }
6212     set_float_exception_flags(old_exc_flags, status);
6213     float_raise(float_flag_invalid, status);
6214     return res;
6215 }
6216 
6217 /*----------------------------------------------------------------------------
6218 | Returns the result of converting the quadruple-precision floating-point
6219 | value `a' to the single-precision floating-point format.  The conversion
6220 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6221 | Arithmetic.
6222 *----------------------------------------------------------------------------*/
6223 
6224 float32 float128_to_float32(float128 a, float_status *status)
6225 {
6226     flag aSign;
6227     int32_t aExp;
6228     uint64_t aSig0, aSig1;
6229     uint32_t zSig;
6230 
6231     aSig1 = extractFloat128Frac1( a );
6232     aSig0 = extractFloat128Frac0( a );
6233     aExp = extractFloat128Exp( a );
6234     aSign = extractFloat128Sign( a );
6235     if ( aExp == 0x7FFF ) {
6236         if ( aSig0 | aSig1 ) {
6237             return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
6238         }
6239         return packFloat32( aSign, 0xFF, 0 );
6240     }
6241     aSig0 |= ( aSig1 != 0 );
6242     shift64RightJamming( aSig0, 18, &aSig0 );
6243     zSig = aSig0;
6244     if ( aExp || zSig ) {
6245         zSig |= 0x40000000;
6246         aExp -= 0x3F81;
6247     }
6248     return roundAndPackFloat32(aSign, aExp, zSig, status);
6249 
6250 }
6251 
6252 /*----------------------------------------------------------------------------
6253 | Returns the result of converting the quadruple-precision floating-point
6254 | value `a' to the double-precision floating-point format.  The conversion
6255 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6256 | Arithmetic.
6257 *----------------------------------------------------------------------------*/
6258 
6259 float64 float128_to_float64(float128 a, float_status *status)
6260 {
6261     flag aSign;
6262     int32_t aExp;
6263     uint64_t aSig0, aSig1;
6264 
6265     aSig1 = extractFloat128Frac1( a );
6266     aSig0 = extractFloat128Frac0( a );
6267     aExp = extractFloat128Exp( a );
6268     aSign = extractFloat128Sign( a );
6269     if ( aExp == 0x7FFF ) {
6270         if ( aSig0 | aSig1 ) {
6271             return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
6272         }
6273         return packFloat64( aSign, 0x7FF, 0 );
6274     }
6275     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6276     aSig0 |= ( aSig1 != 0 );
6277     if ( aExp || aSig0 ) {
6278         aSig0 |= LIT64( 0x4000000000000000 );
6279         aExp -= 0x3C01;
6280     }
6281     return roundAndPackFloat64(aSign, aExp, aSig0, status);
6282 
6283 }
6284 
6285 /*----------------------------------------------------------------------------
6286 | Returns the result of converting the quadruple-precision floating-point
6287 | value `a' to the extended double-precision floating-point format.  The
6288 | conversion is performed according to the IEC/IEEE Standard for Binary
6289 | Floating-Point Arithmetic.
6290 *----------------------------------------------------------------------------*/
6291 
6292 floatx80 float128_to_floatx80(float128 a, float_status *status)
6293 {
6294     flag aSign;
6295     int32_t aExp;
6296     uint64_t aSig0, aSig1;
6297 
6298     aSig1 = extractFloat128Frac1( a );
6299     aSig0 = extractFloat128Frac0( a );
6300     aExp = extractFloat128Exp( a );
6301     aSign = extractFloat128Sign( a );
6302     if ( aExp == 0x7FFF ) {
6303         if ( aSig0 | aSig1 ) {
6304             return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
6305         }
6306         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
6307     }
6308     if ( aExp == 0 ) {
6309         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
6310         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6311     }
6312     else {
6313         aSig0 |= LIT64( 0x0001000000000000 );
6314     }
6315     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
6316     return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
6317 
6318 }
6319 
6320 /*----------------------------------------------------------------------------
6321 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6322 | returns the result as a quadruple-precision floating-point value.  The
6323 | operation is performed according to the IEC/IEEE Standard for Binary
6324 | Floating-Point Arithmetic.
6325 *----------------------------------------------------------------------------*/
6326 
6327 float128 float128_round_to_int(float128 a, float_status *status)
6328 {
6329     flag aSign;
6330     int32_t aExp;
6331     uint64_t lastBitMask, roundBitsMask;
6332     float128 z;
6333 
6334     aExp = extractFloat128Exp( a );
6335     if ( 0x402F <= aExp ) {
6336         if ( 0x406F <= aExp ) {
6337             if (    ( aExp == 0x7FFF )
6338                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
6339                ) {
6340                 return propagateFloat128NaN(a, a, status);
6341             }
6342             return a;
6343         }
6344         lastBitMask = 1;
6345         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
6346         roundBitsMask = lastBitMask - 1;
6347         z = a;
6348         switch (status->float_rounding_mode) {
6349         case float_round_nearest_even:
6350             if ( lastBitMask ) {
6351                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
6352                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
6353             }
6354             else {
6355                 if ( (int64_t) z.low < 0 ) {
6356                     ++z.high;
6357                     if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
6358                 }
6359             }
6360             break;
6361         case float_round_ties_away:
6362             if (lastBitMask) {
6363                 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
6364             } else {
6365                 if ((int64_t) z.low < 0) {
6366                     ++z.high;
6367                 }
6368             }
6369             break;
6370         case float_round_to_zero:
6371             break;
6372         case float_round_up:
6373             if (!extractFloat128Sign(z)) {
6374                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6375             }
6376             break;
6377         case float_round_down:
6378             if (extractFloat128Sign(z)) {
6379                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6380             }
6381             break;
6382         default:
6383             abort();
6384         }
6385         z.low &= ~ roundBitsMask;
6386     }
6387     else {
6388         if ( aExp < 0x3FFF ) {
6389             if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
6390             status->float_exception_flags |= float_flag_inexact;
6391             aSign = extractFloat128Sign( a );
6392             switch (status->float_rounding_mode) {
6393              case float_round_nearest_even:
6394                 if (    ( aExp == 0x3FFE )
6395                      && (   extractFloat128Frac0( a )
6396                           | extractFloat128Frac1( a ) )
6397                    ) {
6398                     return packFloat128( aSign, 0x3FFF, 0, 0 );
6399                 }
6400                 break;
6401             case float_round_ties_away:
6402                 if (aExp == 0x3FFE) {
6403                     return packFloat128(aSign, 0x3FFF, 0, 0);
6404                 }
6405                 break;
6406              case float_round_down:
6407                 return
6408                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
6409                     : packFloat128( 0, 0, 0, 0 );
6410              case float_round_up:
6411                 return
6412                       aSign ? packFloat128( 1, 0, 0, 0 )
6413                     : packFloat128( 0, 0x3FFF, 0, 0 );
6414             }
6415             return packFloat128( aSign, 0, 0, 0 );
6416         }
6417         lastBitMask = 1;
6418         lastBitMask <<= 0x402F - aExp;
6419         roundBitsMask = lastBitMask - 1;
6420         z.low = 0;
6421         z.high = a.high;
6422         switch (status->float_rounding_mode) {
6423         case float_round_nearest_even:
6424             z.high += lastBitMask>>1;
6425             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
6426                 z.high &= ~ lastBitMask;
6427             }
6428             break;
6429         case float_round_ties_away:
6430             z.high += lastBitMask>>1;
6431             break;
6432         case float_round_to_zero:
6433             break;
6434         case float_round_up:
6435             if (!extractFloat128Sign(z)) {
6436                 z.high |= ( a.low != 0 );
6437                 z.high += roundBitsMask;
6438             }
6439             break;
6440         case float_round_down:
6441             if (extractFloat128Sign(z)) {
6442                 z.high |= (a.low != 0);
6443                 z.high += roundBitsMask;
6444             }
6445             break;
6446         default:
6447             abort();
6448         }
6449         z.high &= ~ roundBitsMask;
6450     }
6451     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
6452         status->float_exception_flags |= float_flag_inexact;
6453     }
6454     return z;
6455 
6456 }
6457 
6458 /*----------------------------------------------------------------------------
6459 | Returns the result of adding the absolute values of the quadruple-precision
6460 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
6461 | before being returned.  `zSign' is ignored if the result is a NaN.
6462 | The addition is performed according to the IEC/IEEE Standard for Binary
6463 | Floating-Point Arithmetic.
6464 *----------------------------------------------------------------------------*/
6465 
6466 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
6467                                 float_status *status)
6468 {
6469     int32_t aExp, bExp, zExp;
6470     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6471     int32_t expDiff;
6472 
6473     aSig1 = extractFloat128Frac1( a );
6474     aSig0 = extractFloat128Frac0( a );
6475     aExp = extractFloat128Exp( a );
6476     bSig1 = extractFloat128Frac1( b );
6477     bSig0 = extractFloat128Frac0( b );
6478     bExp = extractFloat128Exp( b );
6479     expDiff = aExp - bExp;
6480     if ( 0 < expDiff ) {
6481         if ( aExp == 0x7FFF ) {
6482             if (aSig0 | aSig1) {
6483                 return propagateFloat128NaN(a, b, status);
6484             }
6485             return a;
6486         }
6487         if ( bExp == 0 ) {
6488             --expDiff;
6489         }
6490         else {
6491             bSig0 |= LIT64( 0x0001000000000000 );
6492         }
6493         shift128ExtraRightJamming(
6494             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
6495         zExp = aExp;
6496     }
6497     else if ( expDiff < 0 ) {
6498         if ( bExp == 0x7FFF ) {
6499             if (bSig0 | bSig1) {
6500                 return propagateFloat128NaN(a, b, status);
6501             }
6502             return packFloat128( zSign, 0x7FFF, 0, 0 );
6503         }
6504         if ( aExp == 0 ) {
6505             ++expDiff;
6506         }
6507         else {
6508             aSig0 |= LIT64( 0x0001000000000000 );
6509         }
6510         shift128ExtraRightJamming(
6511             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
6512         zExp = bExp;
6513     }
6514     else {
6515         if ( aExp == 0x7FFF ) {
6516             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6517                 return propagateFloat128NaN(a, b, status);
6518             }
6519             return a;
6520         }
6521         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6522         if ( aExp == 0 ) {
6523             if (status->flush_to_zero) {
6524                 if (zSig0 | zSig1) {
6525                     float_raise(float_flag_output_denormal, status);
6526                 }
6527                 return packFloat128(zSign, 0, 0, 0);
6528             }
6529             return packFloat128( zSign, 0, zSig0, zSig1 );
6530         }
6531         zSig2 = 0;
6532         zSig0 |= LIT64( 0x0002000000000000 );
6533         zExp = aExp;
6534         goto shiftRight1;
6535     }
6536     aSig0 |= LIT64( 0x0001000000000000 );
6537     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6538     --zExp;
6539     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
6540     ++zExp;
6541  shiftRight1:
6542     shift128ExtraRightJamming(
6543         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6544  roundAndPack:
6545     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6546 
6547 }
6548 
6549 /*----------------------------------------------------------------------------
6550 | Returns the result of subtracting the absolute values of the quadruple-
6551 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
6552 | difference is negated before being returned.  `zSign' is ignored if the
6553 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
6554 | Standard for Binary Floating-Point Arithmetic.
6555 *----------------------------------------------------------------------------*/
6556 
6557 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
6558                                 float_status *status)
6559 {
6560     int32_t aExp, bExp, zExp;
6561     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
6562     int32_t expDiff;
6563 
6564     aSig1 = extractFloat128Frac1( a );
6565     aSig0 = extractFloat128Frac0( a );
6566     aExp = extractFloat128Exp( a );
6567     bSig1 = extractFloat128Frac1( b );
6568     bSig0 = extractFloat128Frac0( b );
6569     bExp = extractFloat128Exp( b );
6570     expDiff = aExp - bExp;
6571     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6572     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6573     if ( 0 < expDiff ) goto aExpBigger;
6574     if ( expDiff < 0 ) goto bExpBigger;
6575     if ( aExp == 0x7FFF ) {
6576         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6577             return propagateFloat128NaN(a, b, status);
6578         }
6579         float_raise(float_flag_invalid, status);
6580         return float128_default_nan(status);
6581     }
6582     if ( aExp == 0 ) {
6583         aExp = 1;
6584         bExp = 1;
6585     }
6586     if ( bSig0 < aSig0 ) goto aBigger;
6587     if ( aSig0 < bSig0 ) goto bBigger;
6588     if ( bSig1 < aSig1 ) goto aBigger;
6589     if ( aSig1 < bSig1 ) goto bBigger;
6590     return packFloat128(status->float_rounding_mode == float_round_down,
6591                         0, 0, 0);
6592  bExpBigger:
6593     if ( bExp == 0x7FFF ) {
6594         if (bSig0 | bSig1) {
6595             return propagateFloat128NaN(a, b, status);
6596         }
6597         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6598     }
6599     if ( aExp == 0 ) {
6600         ++expDiff;
6601     }
6602     else {
6603         aSig0 |= LIT64( 0x4000000000000000 );
6604     }
6605     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6606     bSig0 |= LIT64( 0x4000000000000000 );
6607  bBigger:
6608     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6609     zExp = bExp;
6610     zSign ^= 1;
6611     goto normalizeRoundAndPack;
6612  aExpBigger:
6613     if ( aExp == 0x7FFF ) {
6614         if (aSig0 | aSig1) {
6615             return propagateFloat128NaN(a, b, status);
6616         }
6617         return a;
6618     }
6619     if ( bExp == 0 ) {
6620         --expDiff;
6621     }
6622     else {
6623         bSig0 |= LIT64( 0x4000000000000000 );
6624     }
6625     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6626     aSig0 |= LIT64( 0x4000000000000000 );
6627  aBigger:
6628     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6629     zExp = aExp;
6630  normalizeRoundAndPack:
6631     --zExp;
6632     return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
6633                                          status);
6634 
6635 }
6636 
6637 /*----------------------------------------------------------------------------
6638 | Returns the result of adding the quadruple-precision floating-point values
6639 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
6640 | for Binary Floating-Point Arithmetic.
6641 *----------------------------------------------------------------------------*/
6642 
6643 float128 float128_add(float128 a, float128 b, float_status *status)
6644 {
6645     flag aSign, bSign;
6646 
6647     aSign = extractFloat128Sign( a );
6648     bSign = extractFloat128Sign( b );
6649     if ( aSign == bSign ) {
6650         return addFloat128Sigs(a, b, aSign, status);
6651     }
6652     else {
6653         return subFloat128Sigs(a, b, aSign, status);
6654     }
6655 
6656 }
6657 
6658 /*----------------------------------------------------------------------------
6659 | Returns the result of subtracting the quadruple-precision floating-point
6660 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6661 | Standard for Binary Floating-Point Arithmetic.
6662 *----------------------------------------------------------------------------*/
6663 
6664 float128 float128_sub(float128 a, float128 b, float_status *status)
6665 {
6666     flag aSign, bSign;
6667 
6668     aSign = extractFloat128Sign( a );
6669     bSign = extractFloat128Sign( b );
6670     if ( aSign == bSign ) {
6671         return subFloat128Sigs(a, b, aSign, status);
6672     }
6673     else {
6674         return addFloat128Sigs(a, b, aSign, status);
6675     }
6676 
6677 }
6678 
6679 /*----------------------------------------------------------------------------
6680 | Returns the result of multiplying the quadruple-precision floating-point
6681 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6682 | Standard for Binary Floating-Point Arithmetic.
6683 *----------------------------------------------------------------------------*/
6684 
6685 float128 float128_mul(float128 a, float128 b, float_status *status)
6686 {
6687     flag aSign, bSign, zSign;
6688     int32_t aExp, bExp, zExp;
6689     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
6690 
6691     aSig1 = extractFloat128Frac1( a );
6692     aSig0 = extractFloat128Frac0( a );
6693     aExp = extractFloat128Exp( a );
6694     aSign = extractFloat128Sign( a );
6695     bSig1 = extractFloat128Frac1( b );
6696     bSig0 = extractFloat128Frac0( b );
6697     bExp = extractFloat128Exp( b );
6698     bSign = extractFloat128Sign( b );
6699     zSign = aSign ^ bSign;
6700     if ( aExp == 0x7FFF ) {
6701         if (    ( aSig0 | aSig1 )
6702              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6703             return propagateFloat128NaN(a, b, status);
6704         }
6705         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6706         return packFloat128( zSign, 0x7FFF, 0, 0 );
6707     }
6708     if ( bExp == 0x7FFF ) {
6709         if (bSig0 | bSig1) {
6710             return propagateFloat128NaN(a, b, status);
6711         }
6712         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6713  invalid:
6714             float_raise(float_flag_invalid, status);
6715             return float128_default_nan(status);
6716         }
6717         return packFloat128( zSign, 0x7FFF, 0, 0 );
6718     }
6719     if ( aExp == 0 ) {
6720         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6721         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6722     }
6723     if ( bExp == 0 ) {
6724         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6725         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6726     }
6727     zExp = aExp + bExp - 0x4000;
6728     aSig0 |= LIT64( 0x0001000000000000 );
6729     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6730     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6731     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6732     zSig2 |= ( zSig3 != 0 );
6733     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6734         shift128ExtraRightJamming(
6735             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6736         ++zExp;
6737     }
6738     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6739 
6740 }
6741 
6742 /*----------------------------------------------------------------------------
6743 | Returns the result of dividing the quadruple-precision floating-point value
6744 | `a' by the corresponding value `b'.  The operation is performed according to
6745 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6746 *----------------------------------------------------------------------------*/
6747 
6748 float128 float128_div(float128 a, float128 b, float_status *status)
6749 {
6750     flag aSign, bSign, zSign;
6751     int32_t aExp, bExp, zExp;
6752     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6753     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6754 
6755     aSig1 = extractFloat128Frac1( a );
6756     aSig0 = extractFloat128Frac0( a );
6757     aExp = extractFloat128Exp( a );
6758     aSign = extractFloat128Sign( a );
6759     bSig1 = extractFloat128Frac1( b );
6760     bSig0 = extractFloat128Frac0( b );
6761     bExp = extractFloat128Exp( b );
6762     bSign = extractFloat128Sign( b );
6763     zSign = aSign ^ bSign;
6764     if ( aExp == 0x7FFF ) {
6765         if (aSig0 | aSig1) {
6766             return propagateFloat128NaN(a, b, status);
6767         }
6768         if ( bExp == 0x7FFF ) {
6769             if (bSig0 | bSig1) {
6770                 return propagateFloat128NaN(a, b, status);
6771             }
6772             goto invalid;
6773         }
6774         return packFloat128( zSign, 0x7FFF, 0, 0 );
6775     }
6776     if ( bExp == 0x7FFF ) {
6777         if (bSig0 | bSig1) {
6778             return propagateFloat128NaN(a, b, status);
6779         }
6780         return packFloat128( zSign, 0, 0, 0 );
6781     }
6782     if ( bExp == 0 ) {
6783         if ( ( bSig0 | bSig1 ) == 0 ) {
6784             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6785  invalid:
6786                 float_raise(float_flag_invalid, status);
6787                 return float128_default_nan(status);
6788             }
6789             float_raise(float_flag_divbyzero, status);
6790             return packFloat128( zSign, 0x7FFF, 0, 0 );
6791         }
6792         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6793     }
6794     if ( aExp == 0 ) {
6795         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6796         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6797     }
6798     zExp = aExp - bExp + 0x3FFD;
6799     shortShift128Left(
6800         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6801     shortShift128Left(
6802         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6803     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6804         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6805         ++zExp;
6806     }
6807     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6808     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6809     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
6810     while ( (int64_t) rem0 < 0 ) {
6811         --zSig0;
6812         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6813     }
6814     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6815     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6816         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6817         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
6818         while ( (int64_t) rem1 < 0 ) {
6819             --zSig1;
6820             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6821         }
6822         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6823     }
6824     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
6825     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6826 
6827 }
6828 
6829 /*----------------------------------------------------------------------------
6830 | Returns the remainder of the quadruple-precision floating-point value `a'
6831 | with respect to the corresponding value `b'.  The operation is performed
6832 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6833 *----------------------------------------------------------------------------*/
6834 
6835 float128 float128_rem(float128 a, float128 b, float_status *status)
6836 {
6837     flag aSign, zSign;
6838     int32_t aExp, bExp, expDiff;
6839     uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6840     uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6841     int64_t sigMean0;
6842 
6843     aSig1 = extractFloat128Frac1( a );
6844     aSig0 = extractFloat128Frac0( a );
6845     aExp = extractFloat128Exp( a );
6846     aSign = extractFloat128Sign( a );
6847     bSig1 = extractFloat128Frac1( b );
6848     bSig0 = extractFloat128Frac0( b );
6849     bExp = extractFloat128Exp( b );
6850     if ( aExp == 0x7FFF ) {
6851         if (    ( aSig0 | aSig1 )
6852              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6853             return propagateFloat128NaN(a, b, status);
6854         }
6855         goto invalid;
6856     }
6857     if ( bExp == 0x7FFF ) {
6858         if (bSig0 | bSig1) {
6859             return propagateFloat128NaN(a, b, status);
6860         }
6861         return a;
6862     }
6863     if ( bExp == 0 ) {
6864         if ( ( bSig0 | bSig1 ) == 0 ) {
6865  invalid:
6866             float_raise(float_flag_invalid, status);
6867             return float128_default_nan(status);
6868         }
6869         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6870     }
6871     if ( aExp == 0 ) {
6872         if ( ( aSig0 | aSig1 ) == 0 ) return a;
6873         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6874     }
6875     expDiff = aExp - bExp;
6876     if ( expDiff < -1 ) return a;
6877     shortShift128Left(
6878         aSig0 | LIT64( 0x0001000000000000 ),
6879         aSig1,
6880         15 - ( expDiff < 0 ),
6881         &aSig0,
6882         &aSig1
6883     );
6884     shortShift128Left(
6885         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6886     q = le128( bSig0, bSig1, aSig0, aSig1 );
6887     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6888     expDiff -= 64;
6889     while ( 0 < expDiff ) {
6890         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6891         q = ( 4 < q ) ? q - 4 : 0;
6892         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6893         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6894         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6895         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6896         expDiff -= 61;
6897     }
6898     if ( -64 < expDiff ) {
6899         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6900         q = ( 4 < q ) ? q - 4 : 0;
6901         q >>= - expDiff;
6902         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6903         expDiff += 52;
6904         if ( expDiff < 0 ) {
6905             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6906         }
6907         else {
6908             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6909         }
6910         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6911         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6912     }
6913     else {
6914         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6915         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6916     }
6917     do {
6918         alternateASig0 = aSig0;
6919         alternateASig1 = aSig1;
6920         ++q;
6921         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6922     } while ( 0 <= (int64_t) aSig0 );
6923     add128(
6924         aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
6925     if (    ( sigMean0 < 0 )
6926          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6927         aSig0 = alternateASig0;
6928         aSig1 = alternateASig1;
6929     }
6930     zSign = ( (int64_t) aSig0 < 0 );
6931     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6932     return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
6933                                          status);
6934 }
6935 
6936 /*----------------------------------------------------------------------------
6937 | Returns the square root of the quadruple-precision floating-point value `a'.
6938 | The operation is performed according to the IEC/IEEE Standard for Binary
6939 | Floating-Point Arithmetic.
6940 *----------------------------------------------------------------------------*/
6941 
6942 float128 float128_sqrt(float128 a, float_status *status)
6943 {
6944     flag aSign;
6945     int32_t aExp, zExp;
6946     uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6947     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6948 
6949     aSig1 = extractFloat128Frac1( a );
6950     aSig0 = extractFloat128Frac0( a );
6951     aExp = extractFloat128Exp( a );
6952     aSign = extractFloat128Sign( a );
6953     if ( aExp == 0x7FFF ) {
6954         if (aSig0 | aSig1) {
6955             return propagateFloat128NaN(a, a, status);
6956         }
6957         if ( ! aSign ) return a;
6958         goto invalid;
6959     }
6960     if ( aSign ) {
6961         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6962  invalid:
6963         float_raise(float_flag_invalid, status);
6964         return float128_default_nan(status);
6965     }
6966     if ( aExp == 0 ) {
6967         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6968         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6969     }
6970     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6971     aSig0 |= LIT64( 0x0001000000000000 );
6972     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6973     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6974     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6975     doubleZSig0 = zSig0<<1;
6976     mul64To128( zSig0, zSig0, &term0, &term1 );
6977     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6978     while ( (int64_t) rem0 < 0 ) {
6979         --zSig0;
6980         doubleZSig0 -= 2;
6981         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6982     }
6983     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6984     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6985         if ( zSig1 == 0 ) zSig1 = 1;
6986         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6987         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6988         mul64To128( zSig1, zSig1, &term2, &term3 );
6989         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6990         while ( (int64_t) rem1 < 0 ) {
6991             --zSig1;
6992             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6993             term3 |= 1;
6994             term2 |= doubleZSig0;
6995             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6996         }
6997         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6998     }
6999     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
7000     return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
7001 
7002 }
7003 
7004 /*----------------------------------------------------------------------------
7005 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
7006 | the corresponding value `b', and 0 otherwise.  The invalid exception is
7007 | raised if either operand is a NaN.  Otherwise, the comparison is performed
7008 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7009 *----------------------------------------------------------------------------*/
7010 
7011 int float128_eq(float128 a, float128 b, float_status *status)
7012 {
7013 
7014     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7015               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7016          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7017               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7018        ) {
7019         float_raise(float_flag_invalid, status);
7020         return 0;
7021     }
7022     return
7023            ( a.low == b.low )
7024         && (    ( a.high == b.high )
7025              || (    ( a.low == 0 )
7026                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
7027            );
7028 
7029 }
7030 
7031 /*----------------------------------------------------------------------------
7032 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7033 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
7034 | exception is raised if either operand is a NaN.  The comparison is performed
7035 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7036 *----------------------------------------------------------------------------*/
7037 
7038 int float128_le(float128 a, float128 b, float_status *status)
7039 {
7040     flag aSign, bSign;
7041 
7042     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7043               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7044          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7045               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7046        ) {
7047         float_raise(float_flag_invalid, status);
7048         return 0;
7049     }
7050     aSign = extractFloat128Sign( a );
7051     bSign = extractFloat128Sign( b );
7052     if ( aSign != bSign ) {
7053         return
7054                aSign
7055             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7056                  == 0 );
7057     }
7058     return
7059           aSign ? le128( b.high, b.low, a.high, a.low )
7060         : le128( a.high, a.low, b.high, b.low );
7061 
7062 }
7063 
7064 /*----------------------------------------------------------------------------
7065 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7066 | the corresponding value `b', and 0 otherwise.  The invalid exception is
7067 | raised if either operand is a NaN.  The comparison is performed according
7068 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7069 *----------------------------------------------------------------------------*/
7070 
7071 int float128_lt(float128 a, float128 b, float_status *status)
7072 {
7073     flag aSign, bSign;
7074 
7075     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7076               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7077          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7078               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7079        ) {
7080         float_raise(float_flag_invalid, status);
7081         return 0;
7082     }
7083     aSign = extractFloat128Sign( a );
7084     bSign = extractFloat128Sign( b );
7085     if ( aSign != bSign ) {
7086         return
7087                aSign
7088             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7089                  != 0 );
7090     }
7091     return
7092           aSign ? lt128( b.high, b.low, a.high, a.low )
7093         : lt128( a.high, a.low, b.high, b.low );
7094 
7095 }
7096 
7097 /*----------------------------------------------------------------------------
7098 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7099 | be compared, and 0 otherwise.  The invalid exception is raised if either
7100 | operand is a NaN. The comparison is performed according to the IEC/IEEE
7101 | Standard for Binary Floating-Point Arithmetic.
7102 *----------------------------------------------------------------------------*/
7103 
7104 int float128_unordered(float128 a, float128 b, float_status *status)
7105 {
7106     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7107               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7108          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7109               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7110        ) {
7111         float_raise(float_flag_invalid, status);
7112         return 1;
7113     }
7114     return 0;
7115 }
7116 
7117 /*----------------------------------------------------------------------------
7118 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
7119 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
7120 | exception.  The comparison is performed according to the IEC/IEEE Standard
7121 | for Binary Floating-Point Arithmetic.
7122 *----------------------------------------------------------------------------*/
7123 
7124 int float128_eq_quiet(float128 a, float128 b, float_status *status)
7125 {
7126 
7127     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7128               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7129          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7130               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7131        ) {
7132         if (float128_is_signaling_nan(a, status)
7133          || float128_is_signaling_nan(b, status)) {
7134             float_raise(float_flag_invalid, status);
7135         }
7136         return 0;
7137     }
7138     return
7139            ( a.low == b.low )
7140         && (    ( a.high == b.high )
7141              || (    ( a.low == 0 )
7142                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
7143            );
7144 
7145 }
7146 
7147 /*----------------------------------------------------------------------------
7148 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7149 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
7150 | cause an exception.  Otherwise, the comparison is performed according to the
7151 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7152 *----------------------------------------------------------------------------*/
7153 
7154 int float128_le_quiet(float128 a, float128 b, float_status *status)
7155 {
7156     flag aSign, bSign;
7157 
7158     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7159               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7160          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7161               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7162        ) {
7163         if (float128_is_signaling_nan(a, status)
7164          || float128_is_signaling_nan(b, status)) {
7165             float_raise(float_flag_invalid, status);
7166         }
7167         return 0;
7168     }
7169     aSign = extractFloat128Sign( a );
7170     bSign = extractFloat128Sign( b );
7171     if ( aSign != bSign ) {
7172         return
7173                aSign
7174             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7175                  == 0 );
7176     }
7177     return
7178           aSign ? le128( b.high, b.low, a.high, a.low )
7179         : le128( a.high, a.low, b.high, b.low );
7180 
7181 }
7182 
7183 /*----------------------------------------------------------------------------
7184 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7185 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
7186 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
7187 | Standard for Binary Floating-Point Arithmetic.
7188 *----------------------------------------------------------------------------*/
7189 
7190 int float128_lt_quiet(float128 a, float128 b, float_status *status)
7191 {
7192     flag aSign, bSign;
7193 
7194     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7195               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7196          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7197               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7198        ) {
7199         if (float128_is_signaling_nan(a, status)
7200          || float128_is_signaling_nan(b, status)) {
7201             float_raise(float_flag_invalid, status);
7202         }
7203         return 0;
7204     }
7205     aSign = extractFloat128Sign( a );
7206     bSign = extractFloat128Sign( b );
7207     if ( aSign != bSign ) {
7208         return
7209                aSign
7210             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7211                  != 0 );
7212     }
7213     return
7214           aSign ? lt128( b.high, b.low, a.high, a.low )
7215         : lt128( a.high, a.low, b.high, b.low );
7216 
7217 }
7218 
7219 /*----------------------------------------------------------------------------
7220 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7221 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
7222 | comparison is performed according to the IEC/IEEE Standard for Binary
7223 | Floating-Point Arithmetic.
7224 *----------------------------------------------------------------------------*/
7225 
7226 int float128_unordered_quiet(float128 a, float128 b, float_status *status)
7227 {
7228     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7229               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7230          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7231               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7232        ) {
7233         if (float128_is_signaling_nan(a, status)
7234          || float128_is_signaling_nan(b, status)) {
7235             float_raise(float_flag_invalid, status);
7236         }
7237         return 1;
7238     }
7239     return 0;
7240 }
7241 
7242 /* misc functions */
7243 float32 uint32_to_float32(uint32_t a, float_status *status)
7244 {
7245     return int64_to_float32(a, status);
7246 }
7247 
7248 float64 uint32_to_float64(uint32_t a, float_status *status)
7249 {
7250     return int64_to_float64(a, status);
7251 }
7252 
7253 uint32_t float32_to_uint32(float32 a, float_status *status)
7254 {
7255     int64_t v;
7256     uint32_t res;
7257     int old_exc_flags = get_float_exception_flags(status);
7258 
7259     v = float32_to_int64(a, status);
7260     if (v < 0) {
7261         res = 0;
7262     } else if (v > 0xffffffff) {
7263         res = 0xffffffff;
7264     } else {
7265         return v;
7266     }
7267     set_float_exception_flags(old_exc_flags, status);
7268     float_raise(float_flag_invalid, status);
7269     return res;
7270 }
7271 
7272 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *status)
7273 {
7274     int64_t v;
7275     uint32_t res;
7276     int old_exc_flags = get_float_exception_flags(status);
7277 
7278     v = float32_to_int64_round_to_zero(a, status);
7279     if (v < 0) {
7280         res = 0;
7281     } else if (v > 0xffffffff) {
7282         res = 0xffffffff;
7283     } else {
7284         return v;
7285     }
7286     set_float_exception_flags(old_exc_flags, status);
7287     float_raise(float_flag_invalid, status);
7288     return res;
7289 }
7290 
7291 int16_t float32_to_int16(float32 a, float_status *status)
7292 {
7293     int32_t v;
7294     int16_t res;
7295     int old_exc_flags = get_float_exception_flags(status);
7296 
7297     v = float32_to_int32(a, status);
7298     if (v < -0x8000) {
7299         res = -0x8000;
7300     } else if (v > 0x7fff) {
7301         res = 0x7fff;
7302     } else {
7303         return v;
7304     }
7305 
7306     set_float_exception_flags(old_exc_flags, status);
7307     float_raise(float_flag_invalid, status);
7308     return res;
7309 }
7310 
7311 uint16_t float32_to_uint16(float32 a, float_status *status)
7312 {
7313     int32_t v;
7314     uint16_t res;
7315     int old_exc_flags = get_float_exception_flags(status);
7316 
7317     v = float32_to_int32(a, status);
7318     if (v < 0) {
7319         res = 0;
7320     } else if (v > 0xffff) {
7321         res = 0xffff;
7322     } else {
7323         return v;
7324     }
7325 
7326     set_float_exception_flags(old_exc_flags, status);
7327     float_raise(float_flag_invalid, status);
7328     return res;
7329 }
7330 
7331 uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *status)
7332 {
7333     int64_t v;
7334     uint16_t res;
7335     int old_exc_flags = get_float_exception_flags(status);
7336 
7337     v = float32_to_int64_round_to_zero(a, status);
7338     if (v < 0) {
7339         res = 0;
7340     } else if (v > 0xffff) {
7341         res = 0xffff;
7342     } else {
7343         return v;
7344     }
7345     set_float_exception_flags(old_exc_flags, status);
7346     float_raise(float_flag_invalid, status);
7347     return res;
7348 }
7349 
7350 uint32_t float64_to_uint32(float64 a, float_status *status)
7351 {
7352     uint64_t v;
7353     uint32_t res;
7354     int old_exc_flags = get_float_exception_flags(status);
7355 
7356     v = float64_to_uint64(a, status);
7357     if (v > 0xffffffff) {
7358         res = 0xffffffff;
7359     } else {
7360         return v;
7361     }
7362     set_float_exception_flags(old_exc_flags, status);
7363     float_raise(float_flag_invalid, status);
7364     return res;
7365 }
7366 
7367 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *status)
7368 {
7369     uint64_t v;
7370     uint32_t res;
7371     int old_exc_flags = get_float_exception_flags(status);
7372 
7373     v = float64_to_uint64_round_to_zero(a, status);
7374     if (v > 0xffffffff) {
7375         res = 0xffffffff;
7376     } else {
7377         return v;
7378     }
7379     set_float_exception_flags(old_exc_flags, status);
7380     float_raise(float_flag_invalid, status);
7381     return res;
7382 }
7383 
7384 int16_t float64_to_int16(float64 a, float_status *status)
7385 {
7386     int64_t v;
7387     int16_t res;
7388     int old_exc_flags = get_float_exception_flags(status);
7389 
7390     v = float64_to_int32(a, status);
7391     if (v < -0x8000) {
7392         res = -0x8000;
7393     } else if (v > 0x7fff) {
7394         res = 0x7fff;
7395     } else {
7396         return v;
7397     }
7398 
7399     set_float_exception_flags(old_exc_flags, status);
7400     float_raise(float_flag_invalid, status);
7401     return res;
7402 }
7403 
7404 uint16_t float64_to_uint16(float64 a, float_status *status)
7405 {
7406     int64_t v;
7407     uint16_t res;
7408     int old_exc_flags = get_float_exception_flags(status);
7409 
7410     v = float64_to_int32(a, status);
7411     if (v < 0) {
7412         res = 0;
7413     } else if (v > 0xffff) {
7414         res = 0xffff;
7415     } else {
7416         return v;
7417     }
7418 
7419     set_float_exception_flags(old_exc_flags, status);
7420     float_raise(float_flag_invalid, status);
7421     return res;
7422 }
7423 
7424 uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *status)
7425 {
7426     int64_t v;
7427     uint16_t res;
7428     int old_exc_flags = get_float_exception_flags(status);
7429 
7430     v = float64_to_int64_round_to_zero(a, status);
7431     if (v < 0) {
7432         res = 0;
7433     } else if (v > 0xffff) {
7434         res = 0xffff;
7435     } else {
7436         return v;
7437     }
7438     set_float_exception_flags(old_exc_flags, status);
7439     float_raise(float_flag_invalid, status);
7440     return res;
7441 }
7442 
7443 /*----------------------------------------------------------------------------
7444 | Returns the result of converting the double-precision floating-point value
7445 | `a' to the 64-bit unsigned integer format.  The conversion is
7446 | performed according to the IEC/IEEE Standard for Binary Floating-Point
7447 | Arithmetic---which means in particular that the conversion is rounded
7448 | according to the current rounding mode.  If `a' is a NaN, the largest
7449 | positive integer is returned.  If the conversion overflows, the
7450 | largest unsigned integer is returned.  If 'a' is negative, the value is
7451 | rounded and zero is returned; negative values that do not round to zero
7452 | will raise the inexact exception.
7453 *----------------------------------------------------------------------------*/
7454 
7455 uint64_t float64_to_uint64(float64 a, float_status *status)
7456 {
7457     flag aSign;
7458     int aExp;
7459     int shiftCount;
7460     uint64_t aSig, aSigExtra;
7461     a = float64_squash_input_denormal(a, status);
7462 
7463     aSig = extractFloat64Frac(a);
7464     aExp = extractFloat64Exp(a);
7465     aSign = extractFloat64Sign(a);
7466     if (aSign && (aExp > 1022)) {
7467         float_raise(float_flag_invalid, status);
7468         if (float64_is_any_nan(a)) {
7469             return LIT64(0xFFFFFFFFFFFFFFFF);
7470         } else {
7471             return 0;
7472         }
7473     }
7474     if (aExp) {
7475         aSig |= LIT64(0x0010000000000000);
7476     }
7477     shiftCount = 0x433 - aExp;
7478     if (shiftCount <= 0) {
7479         if (0x43E < aExp) {
7480             float_raise(float_flag_invalid, status);
7481             return LIT64(0xFFFFFFFFFFFFFFFF);
7482         }
7483         aSigExtra = 0;
7484         aSig <<= -shiftCount;
7485     } else {
7486         shift64ExtraRightJamming(aSig, 0, shiftCount, &aSig, &aSigExtra);
7487     }
7488     return roundAndPackUint64(aSign, aSig, aSigExtra, status);
7489 }
7490 
7491 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *status)
7492 {
7493     signed char current_rounding_mode = status->float_rounding_mode;
7494     set_float_rounding_mode(float_round_to_zero, status);
7495     uint64_t v = float64_to_uint64(a, status);
7496     set_float_rounding_mode(current_rounding_mode, status);
7497     return v;
7498 }
7499 
7500 #define COMPARE(s, nan_exp)                                                  \
7501 static inline int float ## s ## _compare_internal(float ## s a, float ## s b,\
7502                                       int is_quiet, float_status *status)    \
7503 {                                                                            \
7504     flag aSign, bSign;                                                       \
7505     uint ## s ## _t av, bv;                                                  \
7506     a = float ## s ## _squash_input_denormal(a, status);                     \
7507     b = float ## s ## _squash_input_denormal(b, status);                     \
7508                                                                              \
7509     if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) &&                    \
7510          extractFloat ## s ## Frac( a ) ) ||                                 \
7511         ( ( extractFloat ## s ## Exp( b ) == nan_exp ) &&                    \
7512           extractFloat ## s ## Frac( b ) )) {                                \
7513         if (!is_quiet ||                                                     \
7514             float ## s ## _is_signaling_nan(a, status) ||                  \
7515             float ## s ## _is_signaling_nan(b, status)) {                 \
7516             float_raise(float_flag_invalid, status);                         \
7517         }                                                                    \
7518         return float_relation_unordered;                                     \
7519     }                                                                        \
7520     aSign = extractFloat ## s ## Sign( a );                                  \
7521     bSign = extractFloat ## s ## Sign( b );                                  \
7522     av = float ## s ## _val(a);                                              \
7523     bv = float ## s ## _val(b);                                              \
7524     if ( aSign != bSign ) {                                                  \
7525         if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) {                   \
7526             /* zero case */                                                  \
7527             return float_relation_equal;                                     \
7528         } else {                                                             \
7529             return 1 - (2 * aSign);                                          \
7530         }                                                                    \
7531     } else {                                                                 \
7532         if (av == bv) {                                                      \
7533             return float_relation_equal;                                     \
7534         } else {                                                             \
7535             return 1 - 2 * (aSign ^ ( av < bv ));                            \
7536         }                                                                    \
7537     }                                                                        \
7538 }                                                                            \
7539                                                                              \
7540 int float ## s ## _compare(float ## s a, float ## s b, float_status *status) \
7541 {                                                                            \
7542     return float ## s ## _compare_internal(a, b, 0, status);                 \
7543 }                                                                            \
7544                                                                              \
7545 int float ## s ## _compare_quiet(float ## s a, float ## s b,                 \
7546                                  float_status *status)                       \
7547 {                                                                            \
7548     return float ## s ## _compare_internal(a, b, 1, status);                 \
7549 }
7550 
7551 COMPARE(32, 0xff)
7552 COMPARE(64, 0x7ff)
7553 
7554 static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
7555                                             int is_quiet, float_status *status)
7556 {
7557     flag aSign, bSign;
7558 
7559     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
7560         float_raise(float_flag_invalid, status);
7561         return float_relation_unordered;
7562     }
7563     if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
7564           ( extractFloatx80Frac( a )<<1 ) ) ||
7565         ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
7566           ( extractFloatx80Frac( b )<<1 ) )) {
7567         if (!is_quiet ||
7568             floatx80_is_signaling_nan(a, status) ||
7569             floatx80_is_signaling_nan(b, status)) {
7570             float_raise(float_flag_invalid, status);
7571         }
7572         return float_relation_unordered;
7573     }
7574     aSign = extractFloatx80Sign( a );
7575     bSign = extractFloatx80Sign( b );
7576     if ( aSign != bSign ) {
7577 
7578         if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
7579              ( ( a.low | b.low ) == 0 ) ) {
7580             /* zero case */
7581             return float_relation_equal;
7582         } else {
7583             return 1 - (2 * aSign);
7584         }
7585     } else {
7586         if (a.low == b.low && a.high == b.high) {
7587             return float_relation_equal;
7588         } else {
7589             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7590         }
7591     }
7592 }
7593 
7594 int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
7595 {
7596     return floatx80_compare_internal(a, b, 0, status);
7597 }
7598 
7599 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
7600 {
7601     return floatx80_compare_internal(a, b, 1, status);
7602 }
7603 
7604 static inline int float128_compare_internal(float128 a, float128 b,
7605                                             int is_quiet, float_status *status)
7606 {
7607     flag aSign, bSign;
7608 
7609     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7610           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7611         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7612           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7613         if (!is_quiet ||
7614             float128_is_signaling_nan(a, status) ||
7615             float128_is_signaling_nan(b, status)) {
7616             float_raise(float_flag_invalid, status);
7617         }
7618         return float_relation_unordered;
7619     }
7620     aSign = extractFloat128Sign( a );
7621     bSign = extractFloat128Sign( b );
7622     if ( aSign != bSign ) {
7623         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7624             /* zero case */
7625             return float_relation_equal;
7626         } else {
7627             return 1 - (2 * aSign);
7628         }
7629     } else {
7630         if (a.low == b.low && a.high == b.high) {
7631             return float_relation_equal;
7632         } else {
7633             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7634         }
7635     }
7636 }
7637 
7638 int float128_compare(float128 a, float128 b, float_status *status)
7639 {
7640     return float128_compare_internal(a, b, 0, status);
7641 }
7642 
7643 int float128_compare_quiet(float128 a, float128 b, float_status *status)
7644 {
7645     return float128_compare_internal(a, b, 1, status);
7646 }
7647 
7648 /* min() and max() functions. These can't be implemented as
7649  * 'compare and pick one input' because that would mishandle
7650  * NaNs and +0 vs -0.
7651  *
7652  * minnum() and maxnum() functions. These are similar to the min()
7653  * and max() functions but if one of the arguments is a QNaN and
7654  * the other is numerical then the numerical argument is returned.
7655  * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
7656  * and maxNum() operations. min() and max() are the typical min/max
7657  * semantics provided by many CPUs which predate that specification.
7658  *
7659  * minnummag() and maxnummag() functions correspond to minNumMag()
7660  * and minNumMag() from the IEEE-754 2008.
7661  */
7662 #define MINMAX(s)                                                       \
7663 static inline float ## s float ## s ## _minmax(float ## s a, float ## s b,     \
7664                                                int ismin, int isieee,   \
7665                                                int ismag,               \
7666                                                float_status *status)    \
7667 {                                                                       \
7668     flag aSign, bSign;                                                  \
7669     uint ## s ## _t av, bv, aav, abv;                                   \
7670     a = float ## s ## _squash_input_denormal(a, status);                \
7671     b = float ## s ## _squash_input_denormal(b, status);                \
7672     if (float ## s ## _is_any_nan(a) ||                                 \
7673         float ## s ## _is_any_nan(b)) {                                 \
7674         if (isieee) {                                                   \
7675             if (float ## s ## _is_quiet_nan(a, status) &&               \
7676                 !float ## s ##_is_any_nan(b)) {                         \
7677                 return b;                                               \
7678             } else if (float ## s ## _is_quiet_nan(b, status) &&        \
7679                        !float ## s ## _is_any_nan(a)) {                \
7680                 return a;                                               \
7681             }                                                           \
7682         }                                                               \
7683         return propagateFloat ## s ## NaN(a, b, status);                \
7684     }                                                                   \
7685     aSign = extractFloat ## s ## Sign(a);                               \
7686     bSign = extractFloat ## s ## Sign(b);                               \
7687     av = float ## s ## _val(a);                                         \
7688     bv = float ## s ## _val(b);                                         \
7689     if (ismag) {                                                        \
7690         aav = float ## s ## _abs(av);                                   \
7691         abv = float ## s ## _abs(bv);                                   \
7692         if (aav != abv) {                                               \
7693             if (ismin) {                                                \
7694                 return (aav < abv) ? a : b;                             \
7695             } else {                                                    \
7696                 return (aav < abv) ? b : a;                             \
7697             }                                                           \
7698         }                                                               \
7699     }                                                                   \
7700     if (aSign != bSign) {                                               \
7701         if (ismin) {                                                    \
7702             return aSign ? a : b;                                       \
7703         } else {                                                        \
7704             return aSign ? b : a;                                       \
7705         }                                                               \
7706     } else {                                                            \
7707         if (ismin) {                                                    \
7708             return (aSign ^ (av < bv)) ? a : b;                         \
7709         } else {                                                        \
7710             return (aSign ^ (av < bv)) ? b : a;                         \
7711         }                                                               \
7712     }                                                                   \
7713 }                                                                       \
7714                                                                         \
7715 float ## s float ## s ## _min(float ## s a, float ## s b,               \
7716                               float_status *status)                     \
7717 {                                                                       \
7718     return float ## s ## _minmax(a, b, 1, 0, 0, status);                \
7719 }                                                                       \
7720                                                                         \
7721 float ## s float ## s ## _max(float ## s a, float ## s b,               \
7722                               float_status *status)                     \
7723 {                                                                       \
7724     return float ## s ## _minmax(a, b, 0, 0, 0, status);                \
7725 }                                                                       \
7726                                                                         \
7727 float ## s float ## s ## _minnum(float ## s a, float ## s b,            \
7728                                  float_status *status)                  \
7729 {                                                                       \
7730     return float ## s ## _minmax(a, b, 1, 1, 0, status);                \
7731 }                                                                       \
7732                                                                         \
7733 float ## s float ## s ## _maxnum(float ## s a, float ## s b,            \
7734                                  float_status *status)                  \
7735 {                                                                       \
7736     return float ## s ## _minmax(a, b, 0, 1, 0, status);                \
7737 }                                                                       \
7738                                                                         \
7739 float ## s float ## s ## _minnummag(float ## s a, float ## s b,         \
7740                                     float_status *status)               \
7741 {                                                                       \
7742     return float ## s ## _minmax(a, b, 1, 1, 1, status);                \
7743 }                                                                       \
7744                                                                         \
7745 float ## s float ## s ## _maxnummag(float ## s a, float ## s b,         \
7746                                     float_status *status)               \
7747 {                                                                       \
7748     return float ## s ## _minmax(a, b, 0, 1, 1, status);                \
7749 }
7750 
7751 MINMAX(32)
7752 MINMAX(64)
7753 
7754 
7755 /* Multiply A by 2 raised to the power N.  */
7756 float32 float32_scalbn(float32 a, int n, float_status *status)
7757 {
7758     flag aSign;
7759     int16_t aExp;
7760     uint32_t aSig;
7761 
7762     a = float32_squash_input_denormal(a, status);
7763     aSig = extractFloat32Frac( a );
7764     aExp = extractFloat32Exp( a );
7765     aSign = extractFloat32Sign( a );
7766 
7767     if ( aExp == 0xFF ) {
7768         if ( aSig ) {
7769             return propagateFloat32NaN(a, a, status);
7770         }
7771         return a;
7772     }
7773     if (aExp != 0) {
7774         aSig |= 0x00800000;
7775     } else if (aSig == 0) {
7776         return a;
7777     } else {
7778         aExp++;
7779     }
7780 
7781     if (n > 0x200) {
7782         n = 0x200;
7783     } else if (n < -0x200) {
7784         n = -0x200;
7785     }
7786 
7787     aExp += n - 1;
7788     aSig <<= 7;
7789     return normalizeRoundAndPackFloat32(aSign, aExp, aSig, status);
7790 }
7791 
7792 float64 float64_scalbn(float64 a, int n, float_status *status)
7793 {
7794     flag aSign;
7795     int16_t aExp;
7796     uint64_t aSig;
7797 
7798     a = float64_squash_input_denormal(a, status);
7799     aSig = extractFloat64Frac( a );
7800     aExp = extractFloat64Exp( a );
7801     aSign = extractFloat64Sign( a );
7802 
7803     if ( aExp == 0x7FF ) {
7804         if ( aSig ) {
7805             return propagateFloat64NaN(a, a, status);
7806         }
7807         return a;
7808     }
7809     if (aExp != 0) {
7810         aSig |= LIT64( 0x0010000000000000 );
7811     } else if (aSig == 0) {
7812         return a;
7813     } else {
7814         aExp++;
7815     }
7816 
7817     if (n > 0x1000) {
7818         n = 0x1000;
7819     } else if (n < -0x1000) {
7820         n = -0x1000;
7821     }
7822 
7823     aExp += n - 1;
7824     aSig <<= 10;
7825     return normalizeRoundAndPackFloat64(aSign, aExp, aSig, status);
7826 }
7827 
7828 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
7829 {
7830     flag aSign;
7831     int32_t aExp;
7832     uint64_t aSig;
7833 
7834     if (floatx80_invalid_encoding(a)) {
7835         float_raise(float_flag_invalid, status);
7836         return floatx80_default_nan(status);
7837     }
7838     aSig = extractFloatx80Frac( a );
7839     aExp = extractFloatx80Exp( a );
7840     aSign = extractFloatx80Sign( a );
7841 
7842     if ( aExp == 0x7FFF ) {
7843         if ( aSig<<1 ) {
7844             return propagateFloatx80NaN(a, a, status);
7845         }
7846         return a;
7847     }
7848 
7849     if (aExp == 0) {
7850         if (aSig == 0) {
7851             return a;
7852         }
7853         aExp++;
7854     }
7855 
7856     if (n > 0x10000) {
7857         n = 0x10000;
7858     } else if (n < -0x10000) {
7859         n = -0x10000;
7860     }
7861 
7862     aExp += n;
7863     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7864                                          aSign, aExp, aSig, 0, status);
7865 }
7866 
7867 float128 float128_scalbn(float128 a, int n, float_status *status)
7868 {
7869     flag aSign;
7870     int32_t aExp;
7871     uint64_t aSig0, aSig1;
7872 
7873     aSig1 = extractFloat128Frac1( a );
7874     aSig0 = extractFloat128Frac0( a );
7875     aExp = extractFloat128Exp( a );
7876     aSign = extractFloat128Sign( a );
7877     if ( aExp == 0x7FFF ) {
7878         if ( aSig0 | aSig1 ) {
7879             return propagateFloat128NaN(a, a, status);
7880         }
7881         return a;
7882     }
7883     if (aExp != 0) {
7884         aSig0 |= LIT64( 0x0001000000000000 );
7885     } else if (aSig0 == 0 && aSig1 == 0) {
7886         return a;
7887     } else {
7888         aExp++;
7889     }
7890 
7891     if (n > 0x10000) {
7892         n = 0x10000;
7893     } else if (n < -0x10000) {
7894         n = -0x10000;
7895     }
7896 
7897     aExp += n - 1;
7898     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7899                                          , status);
7900 
7901 }
7902