xref: /openbmc/qemu/fpu/softfloat.c (revision 7f709ce7)
1 /*
2  * QEMU float support
3  *
4  * The code in this source file is derived from release 2a of the SoftFloat
5  * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6  * some later contributions) are provided under that license, as detailed below.
7  * It has subsequently been modified by contributors to the QEMU Project,
8  * so some portions are provided under:
9  *  the SoftFloat-2a license
10  *  the BSD license
11  *  GPL-v2-or-later
12  *
13  * Any future contributions to this file after December 1st 2014 will be
14  * taken to be licensed under the Softfloat-2a license unless specifically
15  * indicated otherwise.
16  */
17 
18 /*
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
22 
23 Written by John R. Hauser.  This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704.  Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980.  The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
32 
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
38 
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
43 
44 ===============================================================================
45 */
46 
47 /* BSD licensing:
48  * Copyright (c) 2006, Fabrice Bellard
49  * All rights reserved.
50  *
51  * Redistribution and use in source and binary forms, with or without
52  * modification, are permitted provided that the following conditions are met:
53  *
54  * 1. Redistributions of source code must retain the above copyright notice,
55  * this list of conditions and the following disclaimer.
56  *
57  * 2. Redistributions in binary form must reproduce the above copyright notice,
58  * this list of conditions and the following disclaimer in the documentation
59  * and/or other materials provided with the distribution.
60  *
61  * 3. Neither the name of the copyright holder nor the names of its contributors
62  * may be used to endorse or promote products derived from this software without
63  * specific prior written permission.
64  *
65  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75  * THE POSSIBILITY OF SUCH DAMAGE.
76  */
77 
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79  * version 2 or later. See the COPYING file in the top-level directory.
80  */
81 
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83  * target-dependent and needs the TARGET_* macros.
84  */
85 #include "qemu/osdep.h"
86 
87 #include "fpu/softfloat.h"
88 
89 /* We only need stdlib for abort() */
90 
91 /*----------------------------------------------------------------------------
92 | Primitive arithmetic functions, including multi-word arithmetic, and
93 | division and square root approximations.  (Can be specialized to target if
94 | desired.)
95 *----------------------------------------------------------------------------*/
96 #include "softfloat-macros.h"
97 
98 /*----------------------------------------------------------------------------
99 | Functions and definitions to determine:  (1) whether tininess for underflow
100 | is detected before or after rounding by default, (2) what (if anything)
101 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
102 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
103 | are propagated from function inputs to output.  These details are target-
104 | specific.
105 *----------------------------------------------------------------------------*/
106 #include "softfloat-specialize.h"
107 
108 /*----------------------------------------------------------------------------
109 | Returns the fraction bits of the half-precision floating-point value `a'.
110 *----------------------------------------------------------------------------*/
111 
112 static inline uint32_t extractFloat16Frac(float16 a)
113 {
114     return float16_val(a) & 0x3ff;
115 }
116 
117 /*----------------------------------------------------------------------------
118 | Returns the exponent bits of the half-precision floating-point value `a'.
119 *----------------------------------------------------------------------------*/
120 
121 static inline int extractFloat16Exp(float16 a)
122 {
123     return (float16_val(a) >> 10) & 0x1f;
124 }
125 
126 /*----------------------------------------------------------------------------
127 | Returns the sign bit of the single-precision floating-point value `a'.
128 *----------------------------------------------------------------------------*/
129 
130 static inline flag extractFloat16Sign(float16 a)
131 {
132     return float16_val(a)>>15;
133 }
134 
135 /*----------------------------------------------------------------------------
136 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
137 | and 7, and returns the properly rounded 32-bit integer corresponding to the
138 | input.  If `zSign' is 1, the input is negated before being converted to an
139 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
140 | is simply rounded to an integer, with the inexact exception raised if the
141 | input cannot be represented exactly as an integer.  However, if the fixed-
142 | point input is too large, the invalid exception is raised and the largest
143 | positive or negative integer is returned.
144 *----------------------------------------------------------------------------*/
145 
146 static int32_t roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
147 {
148     int8_t roundingMode;
149     flag roundNearestEven;
150     int8_t roundIncrement, roundBits;
151     int32_t z;
152 
153     roundingMode = status->float_rounding_mode;
154     roundNearestEven = ( roundingMode == float_round_nearest_even );
155     switch (roundingMode) {
156     case float_round_nearest_even:
157     case float_round_ties_away:
158         roundIncrement = 0x40;
159         break;
160     case float_round_to_zero:
161         roundIncrement = 0;
162         break;
163     case float_round_up:
164         roundIncrement = zSign ? 0 : 0x7f;
165         break;
166     case float_round_down:
167         roundIncrement = zSign ? 0x7f : 0;
168         break;
169     default:
170         abort();
171     }
172     roundBits = absZ & 0x7F;
173     absZ = ( absZ + roundIncrement )>>7;
174     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
175     z = absZ;
176     if ( zSign ) z = - z;
177     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
178         float_raise(float_flag_invalid, status);
179         return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
180     }
181     if (roundBits) {
182         status->float_exception_flags |= float_flag_inexact;
183     }
184     return z;
185 
186 }
187 
188 /*----------------------------------------------------------------------------
189 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
190 | `absZ1', with binary point between bits 63 and 64 (between the input words),
191 | and returns the properly rounded 64-bit integer corresponding to the input.
192 | If `zSign' is 1, the input is negated before being converted to an integer.
193 | Ordinarily, the fixed-point input is simply rounded to an integer, with
194 | the inexact exception raised if the input cannot be represented exactly as
195 | an integer.  However, if the fixed-point input is too large, the invalid
196 | exception is raised and the largest positive or negative integer is
197 | returned.
198 *----------------------------------------------------------------------------*/
199 
200 static int64_t roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
201                                float_status *status)
202 {
203     int8_t roundingMode;
204     flag roundNearestEven, increment;
205     int64_t z;
206 
207     roundingMode = status->float_rounding_mode;
208     roundNearestEven = ( roundingMode == float_round_nearest_even );
209     switch (roundingMode) {
210     case float_round_nearest_even:
211     case float_round_ties_away:
212         increment = ((int64_t) absZ1 < 0);
213         break;
214     case float_round_to_zero:
215         increment = 0;
216         break;
217     case float_round_up:
218         increment = !zSign && absZ1;
219         break;
220     case float_round_down:
221         increment = zSign && absZ1;
222         break;
223     default:
224         abort();
225     }
226     if ( increment ) {
227         ++absZ0;
228         if ( absZ0 == 0 ) goto overflow;
229         absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
230     }
231     z = absZ0;
232     if ( zSign ) z = - z;
233     if ( z && ( ( z < 0 ) ^ zSign ) ) {
234  overflow:
235         float_raise(float_flag_invalid, status);
236         return
237               zSign ? (int64_t) LIT64( 0x8000000000000000 )
238             : LIT64( 0x7FFFFFFFFFFFFFFF );
239     }
240     if (absZ1) {
241         status->float_exception_flags |= float_flag_inexact;
242     }
243     return z;
244 
245 }
246 
247 /*----------------------------------------------------------------------------
248 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
249 | `absZ1', with binary point between bits 63 and 64 (between the input words),
250 | and returns the properly rounded 64-bit unsigned integer corresponding to the
251 | input.  Ordinarily, the fixed-point input is simply rounded to an integer,
252 | with the inexact exception raised if the input cannot be represented exactly
253 | as an integer.  However, if the fixed-point input is too large, the invalid
254 | exception is raised and the largest unsigned integer is returned.
255 *----------------------------------------------------------------------------*/
256 
257 static int64_t roundAndPackUint64(flag zSign, uint64_t absZ0,
258                                 uint64_t absZ1, float_status *status)
259 {
260     int8_t roundingMode;
261     flag roundNearestEven, increment;
262 
263     roundingMode = status->float_rounding_mode;
264     roundNearestEven = (roundingMode == float_round_nearest_even);
265     switch (roundingMode) {
266     case float_round_nearest_even:
267     case float_round_ties_away:
268         increment = ((int64_t)absZ1 < 0);
269         break;
270     case float_round_to_zero:
271         increment = 0;
272         break;
273     case float_round_up:
274         increment = !zSign && absZ1;
275         break;
276     case float_round_down:
277         increment = zSign && absZ1;
278         break;
279     default:
280         abort();
281     }
282     if (increment) {
283         ++absZ0;
284         if (absZ0 == 0) {
285             float_raise(float_flag_invalid, status);
286             return LIT64(0xFFFFFFFFFFFFFFFF);
287         }
288         absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
289     }
290 
291     if (zSign && absZ0) {
292         float_raise(float_flag_invalid, status);
293         return 0;
294     }
295 
296     if (absZ1) {
297         status->float_exception_flags |= float_flag_inexact;
298     }
299     return absZ0;
300 }
301 
302 /*----------------------------------------------------------------------------
303 | Returns the fraction bits of the single-precision floating-point value `a'.
304 *----------------------------------------------------------------------------*/
305 
306 static inline uint32_t extractFloat32Frac( float32 a )
307 {
308 
309     return float32_val(a) & 0x007FFFFF;
310 
311 }
312 
313 /*----------------------------------------------------------------------------
314 | Returns the exponent bits of the single-precision floating-point value `a'.
315 *----------------------------------------------------------------------------*/
316 
317 static inline int extractFloat32Exp(float32 a)
318 {
319 
320     return ( float32_val(a)>>23 ) & 0xFF;
321 
322 }
323 
324 /*----------------------------------------------------------------------------
325 | Returns the sign bit of the single-precision floating-point value `a'.
326 *----------------------------------------------------------------------------*/
327 
328 static inline flag extractFloat32Sign( float32 a )
329 {
330 
331     return float32_val(a)>>31;
332 
333 }
334 
335 /*----------------------------------------------------------------------------
336 | If `a' is denormal and we are in flush-to-zero mode then set the
337 | input-denormal exception and return zero. Otherwise just return the value.
338 *----------------------------------------------------------------------------*/
339 float32 float32_squash_input_denormal(float32 a, float_status *status)
340 {
341     if (status->flush_inputs_to_zero) {
342         if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
343             float_raise(float_flag_input_denormal, status);
344             return make_float32(float32_val(a) & 0x80000000);
345         }
346     }
347     return a;
348 }
349 
350 /*----------------------------------------------------------------------------
351 | Normalizes the subnormal single-precision floating-point value represented
352 | by the denormalized significand `aSig'.  The normalized exponent and
353 | significand are stored at the locations pointed to by `zExpPtr' and
354 | `zSigPtr', respectively.
355 *----------------------------------------------------------------------------*/
356 
357 static void
358  normalizeFloat32Subnormal(uint32_t aSig, int *zExpPtr, uint32_t *zSigPtr)
359 {
360     int8_t shiftCount;
361 
362     shiftCount = countLeadingZeros32( aSig ) - 8;
363     *zSigPtr = aSig<<shiftCount;
364     *zExpPtr = 1 - shiftCount;
365 
366 }
367 
368 /*----------------------------------------------------------------------------
369 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
370 | single-precision floating-point value, returning the result.  After being
371 | shifted into the proper positions, the three fields are simply added
372 | together to form the result.  This means that any integer portion of `zSig'
373 | will be added into the exponent.  Since a properly normalized significand
374 | will have an integer portion equal to 1, the `zExp' input should be 1 less
375 | than the desired result exponent whenever `zSig' is a complete, normalized
376 | significand.
377 *----------------------------------------------------------------------------*/
378 
379 static inline float32 packFloat32(flag zSign, int zExp, uint32_t zSig)
380 {
381 
382     return make_float32(
383           ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
384 
385 }
386 
387 /*----------------------------------------------------------------------------
388 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
389 | and significand `zSig', and returns the proper single-precision floating-
390 | point value corresponding to the abstract input.  Ordinarily, the abstract
391 | value is simply rounded and packed into the single-precision format, with
392 | the inexact exception raised if the abstract input cannot be represented
393 | exactly.  However, if the abstract value is too large, the overflow and
394 | inexact exceptions are raised and an infinity or maximal finite value is
395 | returned.  If the abstract value is too small, the input value is rounded to
396 | a subnormal number, and the underflow and inexact exceptions are raised if
397 | the abstract input cannot be represented exactly as a subnormal single-
398 | precision floating-point number.
399 |     The input significand `zSig' has its binary point between bits 30
400 | and 29, which is 7 bits to the left of the usual location.  This shifted
401 | significand must be normalized or smaller.  If `zSig' is not normalized,
402 | `zExp' must be 0; in that case, the result returned is a subnormal number,
403 | and it must not require rounding.  In the usual case that `zSig' is
404 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
405 | The handling of underflow and overflow follows the IEC/IEEE Standard for
406 | Binary Floating-Point Arithmetic.
407 *----------------------------------------------------------------------------*/
408 
409 static float32 roundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
410                                    float_status *status)
411 {
412     int8_t roundingMode;
413     flag roundNearestEven;
414     int8_t roundIncrement, roundBits;
415     flag isTiny;
416 
417     roundingMode = status->float_rounding_mode;
418     roundNearestEven = ( roundingMode == float_round_nearest_even );
419     switch (roundingMode) {
420     case float_round_nearest_even:
421     case float_round_ties_away:
422         roundIncrement = 0x40;
423         break;
424     case float_round_to_zero:
425         roundIncrement = 0;
426         break;
427     case float_round_up:
428         roundIncrement = zSign ? 0 : 0x7f;
429         break;
430     case float_round_down:
431         roundIncrement = zSign ? 0x7f : 0;
432         break;
433     default:
434         abort();
435         break;
436     }
437     roundBits = zSig & 0x7F;
438     if ( 0xFD <= (uint16_t) zExp ) {
439         if (    ( 0xFD < zExp )
440              || (    ( zExp == 0xFD )
441                   && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
442            ) {
443             float_raise(float_flag_overflow | float_flag_inexact, status);
444             return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
445         }
446         if ( zExp < 0 ) {
447             if (status->flush_to_zero) {
448                 float_raise(float_flag_output_denormal, status);
449                 return packFloat32(zSign, 0, 0);
450             }
451             isTiny =
452                 (status->float_detect_tininess
453                  == float_tininess_before_rounding)
454                 || ( zExp < -1 )
455                 || ( zSig + roundIncrement < 0x80000000 );
456             shift32RightJamming( zSig, - zExp, &zSig );
457             zExp = 0;
458             roundBits = zSig & 0x7F;
459             if (isTiny && roundBits) {
460                 float_raise(float_flag_underflow, status);
461             }
462         }
463     }
464     if (roundBits) {
465         status->float_exception_flags |= float_flag_inexact;
466     }
467     zSig = ( zSig + roundIncrement )>>7;
468     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
469     if ( zSig == 0 ) zExp = 0;
470     return packFloat32( zSign, zExp, zSig );
471 
472 }
473 
474 /*----------------------------------------------------------------------------
475 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
476 | and significand `zSig', and returns the proper single-precision floating-
477 | point value corresponding to the abstract input.  This routine is just like
478 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
479 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
480 | floating-point exponent.
481 *----------------------------------------------------------------------------*/
482 
483 static float32
484  normalizeRoundAndPackFloat32(flag zSign, int zExp, uint32_t zSig,
485                               float_status *status)
486 {
487     int8_t shiftCount;
488 
489     shiftCount = countLeadingZeros32( zSig ) - 1;
490     return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
491                                status);
492 
493 }
494 
495 /*----------------------------------------------------------------------------
496 | Returns the fraction bits of the double-precision floating-point value `a'.
497 *----------------------------------------------------------------------------*/
498 
499 static inline uint64_t extractFloat64Frac( float64 a )
500 {
501 
502     return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
503 
504 }
505 
506 /*----------------------------------------------------------------------------
507 | Returns the exponent bits of the double-precision floating-point value `a'.
508 *----------------------------------------------------------------------------*/
509 
510 static inline int extractFloat64Exp(float64 a)
511 {
512 
513     return ( float64_val(a)>>52 ) & 0x7FF;
514 
515 }
516 
517 /*----------------------------------------------------------------------------
518 | Returns the sign bit of the double-precision floating-point value `a'.
519 *----------------------------------------------------------------------------*/
520 
521 static inline flag extractFloat64Sign( float64 a )
522 {
523 
524     return float64_val(a)>>63;
525 
526 }
527 
528 /*----------------------------------------------------------------------------
529 | If `a' is denormal and we are in flush-to-zero mode then set the
530 | input-denormal exception and return zero. Otherwise just return the value.
531 *----------------------------------------------------------------------------*/
532 float64 float64_squash_input_denormal(float64 a, float_status *status)
533 {
534     if (status->flush_inputs_to_zero) {
535         if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
536             float_raise(float_flag_input_denormal, status);
537             return make_float64(float64_val(a) & (1ULL << 63));
538         }
539     }
540     return a;
541 }
542 
543 /*----------------------------------------------------------------------------
544 | Normalizes the subnormal double-precision floating-point value represented
545 | by the denormalized significand `aSig'.  The normalized exponent and
546 | significand are stored at the locations pointed to by `zExpPtr' and
547 | `zSigPtr', respectively.
548 *----------------------------------------------------------------------------*/
549 
550 static void
551  normalizeFloat64Subnormal(uint64_t aSig, int *zExpPtr, uint64_t *zSigPtr)
552 {
553     int8_t shiftCount;
554 
555     shiftCount = countLeadingZeros64( aSig ) - 11;
556     *zSigPtr = aSig<<shiftCount;
557     *zExpPtr = 1 - shiftCount;
558 
559 }
560 
561 /*----------------------------------------------------------------------------
562 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
563 | double-precision floating-point value, returning the result.  After being
564 | shifted into the proper positions, the three fields are simply added
565 | together to form the result.  This means that any integer portion of `zSig'
566 | will be added into the exponent.  Since a properly normalized significand
567 | will have an integer portion equal to 1, the `zExp' input should be 1 less
568 | than the desired result exponent whenever `zSig' is a complete, normalized
569 | significand.
570 *----------------------------------------------------------------------------*/
571 
572 static inline float64 packFloat64(flag zSign, int zExp, uint64_t zSig)
573 {
574 
575     return make_float64(
576         ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
577 
578 }
579 
580 /*----------------------------------------------------------------------------
581 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
582 | and significand `zSig', and returns the proper double-precision floating-
583 | point value corresponding to the abstract input.  Ordinarily, the abstract
584 | value is simply rounded and packed into the double-precision format, with
585 | the inexact exception raised if the abstract input cannot be represented
586 | exactly.  However, if the abstract value is too large, the overflow and
587 | inexact exceptions are raised and an infinity or maximal finite value is
588 | returned.  If the abstract value is too small, the input value is rounded to
589 | a subnormal number, and the underflow and inexact exceptions are raised if
590 | the abstract input cannot be represented exactly as a subnormal double-
591 | precision floating-point number.
592 |     The input significand `zSig' has its binary point between bits 62
593 | and 61, which is 10 bits to the left of the usual location.  This shifted
594 | significand must be normalized or smaller.  If `zSig' is not normalized,
595 | `zExp' must be 0; in that case, the result returned is a subnormal number,
596 | and it must not require rounding.  In the usual case that `zSig' is
597 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
598 | The handling of underflow and overflow follows the IEC/IEEE Standard for
599 | Binary Floating-Point Arithmetic.
600 *----------------------------------------------------------------------------*/
601 
602 static float64 roundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
603                                    float_status *status)
604 {
605     int8_t roundingMode;
606     flag roundNearestEven;
607     int roundIncrement, roundBits;
608     flag isTiny;
609 
610     roundingMode = status->float_rounding_mode;
611     roundNearestEven = ( roundingMode == float_round_nearest_even );
612     switch (roundingMode) {
613     case float_round_nearest_even:
614     case float_round_ties_away:
615         roundIncrement = 0x200;
616         break;
617     case float_round_to_zero:
618         roundIncrement = 0;
619         break;
620     case float_round_up:
621         roundIncrement = zSign ? 0 : 0x3ff;
622         break;
623     case float_round_down:
624         roundIncrement = zSign ? 0x3ff : 0;
625         break;
626     case float_round_to_odd:
627         roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
628         break;
629     default:
630         abort();
631     }
632     roundBits = zSig & 0x3FF;
633     if ( 0x7FD <= (uint16_t) zExp ) {
634         if (    ( 0x7FD < zExp )
635              || (    ( zExp == 0x7FD )
636                   && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
637            ) {
638             bool overflow_to_inf = roundingMode != float_round_to_odd &&
639                                    roundIncrement != 0;
640             float_raise(float_flag_overflow | float_flag_inexact, status);
641             return packFloat64(zSign, 0x7FF, -(!overflow_to_inf));
642         }
643         if ( zExp < 0 ) {
644             if (status->flush_to_zero) {
645                 float_raise(float_flag_output_denormal, status);
646                 return packFloat64(zSign, 0, 0);
647             }
648             isTiny =
649                    (status->float_detect_tininess
650                     == float_tininess_before_rounding)
651                 || ( zExp < -1 )
652                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
653             shift64RightJamming( zSig, - zExp, &zSig );
654             zExp = 0;
655             roundBits = zSig & 0x3FF;
656             if (isTiny && roundBits) {
657                 float_raise(float_flag_underflow, status);
658             }
659             if (roundingMode == float_round_to_odd) {
660                 /*
661                  * For round-to-odd case, the roundIncrement depends on
662                  * zSig which just changed.
663                  */
664                 roundIncrement = (zSig & 0x400) ? 0 : 0x3ff;
665             }
666         }
667     }
668     if (roundBits) {
669         status->float_exception_flags |= float_flag_inexact;
670     }
671     zSig = ( zSig + roundIncrement )>>10;
672     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
673     if ( zSig == 0 ) zExp = 0;
674     return packFloat64( zSign, zExp, zSig );
675 
676 }
677 
678 /*----------------------------------------------------------------------------
679 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
680 | and significand `zSig', and returns the proper double-precision floating-
681 | point value corresponding to the abstract input.  This routine is just like
682 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
683 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
684 | floating-point exponent.
685 *----------------------------------------------------------------------------*/
686 
687 static float64
688  normalizeRoundAndPackFloat64(flag zSign, int zExp, uint64_t zSig,
689                               float_status *status)
690 {
691     int8_t shiftCount;
692 
693     shiftCount = countLeadingZeros64( zSig ) - 1;
694     return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
695                                status);
696 
697 }
698 
699 /*----------------------------------------------------------------------------
700 | Returns the fraction bits of the extended double-precision floating-point
701 | value `a'.
702 *----------------------------------------------------------------------------*/
703 
704 static inline uint64_t extractFloatx80Frac( floatx80 a )
705 {
706 
707     return a.low;
708 
709 }
710 
711 /*----------------------------------------------------------------------------
712 | Returns the exponent bits of the extended double-precision floating-point
713 | value `a'.
714 *----------------------------------------------------------------------------*/
715 
716 static inline int32_t extractFloatx80Exp( floatx80 a )
717 {
718 
719     return a.high & 0x7FFF;
720 
721 }
722 
723 /*----------------------------------------------------------------------------
724 | Returns the sign bit of the extended double-precision floating-point value
725 | `a'.
726 *----------------------------------------------------------------------------*/
727 
728 static inline flag extractFloatx80Sign( floatx80 a )
729 {
730 
731     return a.high>>15;
732 
733 }
734 
735 /*----------------------------------------------------------------------------
736 | Normalizes the subnormal extended double-precision floating-point value
737 | represented by the denormalized significand `aSig'.  The normalized exponent
738 | and significand are stored at the locations pointed to by `zExpPtr' and
739 | `zSigPtr', respectively.
740 *----------------------------------------------------------------------------*/
741 
742 static void
743  normalizeFloatx80Subnormal( uint64_t aSig, int32_t *zExpPtr, uint64_t *zSigPtr )
744 {
745     int8_t shiftCount;
746 
747     shiftCount = countLeadingZeros64( aSig );
748     *zSigPtr = aSig<<shiftCount;
749     *zExpPtr = 1 - shiftCount;
750 
751 }
752 
753 /*----------------------------------------------------------------------------
754 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
755 | extended double-precision floating-point value, returning the result.
756 *----------------------------------------------------------------------------*/
757 
758 static inline floatx80 packFloatx80( flag zSign, int32_t zExp, uint64_t zSig )
759 {
760     floatx80 z;
761 
762     z.low = zSig;
763     z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
764     return z;
765 
766 }
767 
768 /*----------------------------------------------------------------------------
769 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
770 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
771 | and returns the proper extended double-precision floating-point value
772 | corresponding to the abstract input.  Ordinarily, the abstract value is
773 | rounded and packed into the extended double-precision format, with the
774 | inexact exception raised if the abstract input cannot be represented
775 | exactly.  However, if the abstract value is too large, the overflow and
776 | inexact exceptions are raised and an infinity or maximal finite value is
777 | returned.  If the abstract value is too small, the input value is rounded to
778 | a subnormal number, and the underflow and inexact exceptions are raised if
779 | the abstract input cannot be represented exactly as a subnormal extended
780 | double-precision floating-point number.
781 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
782 | number of bits as single or double precision, respectively.  Otherwise, the
783 | result is rounded to the full precision of the extended double-precision
784 | format.
785 |     The input significand must be normalized or smaller.  If the input
786 | significand is not normalized, `zExp' must be 0; in that case, the result
787 | returned is a subnormal number, and it must not require rounding.  The
788 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
789 | Floating-Point Arithmetic.
790 *----------------------------------------------------------------------------*/
791 
792 static floatx80 roundAndPackFloatx80(int8_t roundingPrecision, flag zSign,
793                                      int32_t zExp, uint64_t zSig0, uint64_t zSig1,
794                                      float_status *status)
795 {
796     int8_t roundingMode;
797     flag roundNearestEven, increment, isTiny;
798     int64_t roundIncrement, roundMask, roundBits;
799 
800     roundingMode = status->float_rounding_mode;
801     roundNearestEven = ( roundingMode == float_round_nearest_even );
802     if ( roundingPrecision == 80 ) goto precision80;
803     if ( roundingPrecision == 64 ) {
804         roundIncrement = LIT64( 0x0000000000000400 );
805         roundMask = LIT64( 0x00000000000007FF );
806     }
807     else if ( roundingPrecision == 32 ) {
808         roundIncrement = LIT64( 0x0000008000000000 );
809         roundMask = LIT64( 0x000000FFFFFFFFFF );
810     }
811     else {
812         goto precision80;
813     }
814     zSig0 |= ( zSig1 != 0 );
815     switch (roundingMode) {
816     case float_round_nearest_even:
817     case float_round_ties_away:
818         break;
819     case float_round_to_zero:
820         roundIncrement = 0;
821         break;
822     case float_round_up:
823         roundIncrement = zSign ? 0 : roundMask;
824         break;
825     case float_round_down:
826         roundIncrement = zSign ? roundMask : 0;
827         break;
828     default:
829         abort();
830     }
831     roundBits = zSig0 & roundMask;
832     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
833         if (    ( 0x7FFE < zExp )
834              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
835            ) {
836             goto overflow;
837         }
838         if ( zExp <= 0 ) {
839             if (status->flush_to_zero) {
840                 float_raise(float_flag_output_denormal, status);
841                 return packFloatx80(zSign, 0, 0);
842             }
843             isTiny =
844                    (status->float_detect_tininess
845                     == float_tininess_before_rounding)
846                 || ( zExp < 0 )
847                 || ( zSig0 <= zSig0 + roundIncrement );
848             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
849             zExp = 0;
850             roundBits = zSig0 & roundMask;
851             if (isTiny && roundBits) {
852                 float_raise(float_flag_underflow, status);
853             }
854             if (roundBits) {
855                 status->float_exception_flags |= float_flag_inexact;
856             }
857             zSig0 += roundIncrement;
858             if ( (int64_t) zSig0 < 0 ) zExp = 1;
859             roundIncrement = roundMask + 1;
860             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
861                 roundMask |= roundIncrement;
862             }
863             zSig0 &= ~ roundMask;
864             return packFloatx80( zSign, zExp, zSig0 );
865         }
866     }
867     if (roundBits) {
868         status->float_exception_flags |= float_flag_inexact;
869     }
870     zSig0 += roundIncrement;
871     if ( zSig0 < roundIncrement ) {
872         ++zExp;
873         zSig0 = LIT64( 0x8000000000000000 );
874     }
875     roundIncrement = roundMask + 1;
876     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
877         roundMask |= roundIncrement;
878     }
879     zSig0 &= ~ roundMask;
880     if ( zSig0 == 0 ) zExp = 0;
881     return packFloatx80( zSign, zExp, zSig0 );
882  precision80:
883     switch (roundingMode) {
884     case float_round_nearest_even:
885     case float_round_ties_away:
886         increment = ((int64_t)zSig1 < 0);
887         break;
888     case float_round_to_zero:
889         increment = 0;
890         break;
891     case float_round_up:
892         increment = !zSign && zSig1;
893         break;
894     case float_round_down:
895         increment = zSign && zSig1;
896         break;
897     default:
898         abort();
899     }
900     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
901         if (    ( 0x7FFE < zExp )
902              || (    ( zExp == 0x7FFE )
903                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
904                   && increment
905                 )
906            ) {
907             roundMask = 0;
908  overflow:
909             float_raise(float_flag_overflow | float_flag_inexact, status);
910             if (    ( roundingMode == float_round_to_zero )
911                  || ( zSign && ( roundingMode == float_round_up ) )
912                  || ( ! zSign && ( roundingMode == float_round_down ) )
913                ) {
914                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
915             }
916             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
917         }
918         if ( zExp <= 0 ) {
919             isTiny =
920                    (status->float_detect_tininess
921                     == float_tininess_before_rounding)
922                 || ( zExp < 0 )
923                 || ! increment
924                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
925             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
926             zExp = 0;
927             if (isTiny && zSig1) {
928                 float_raise(float_flag_underflow, status);
929             }
930             if (zSig1) {
931                 status->float_exception_flags |= float_flag_inexact;
932             }
933             switch (roundingMode) {
934             case float_round_nearest_even:
935             case float_round_ties_away:
936                 increment = ((int64_t)zSig1 < 0);
937                 break;
938             case float_round_to_zero:
939                 increment = 0;
940                 break;
941             case float_round_up:
942                 increment = !zSign && zSig1;
943                 break;
944             case float_round_down:
945                 increment = zSign && zSig1;
946                 break;
947             default:
948                 abort();
949             }
950             if ( increment ) {
951                 ++zSig0;
952                 zSig0 &=
953                     ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
954                 if ( (int64_t) zSig0 < 0 ) zExp = 1;
955             }
956             return packFloatx80( zSign, zExp, zSig0 );
957         }
958     }
959     if (zSig1) {
960         status->float_exception_flags |= float_flag_inexact;
961     }
962     if ( increment ) {
963         ++zSig0;
964         if ( zSig0 == 0 ) {
965             ++zExp;
966             zSig0 = LIT64( 0x8000000000000000 );
967         }
968         else {
969             zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
970         }
971     }
972     else {
973         if ( zSig0 == 0 ) zExp = 0;
974     }
975     return packFloatx80( zSign, zExp, zSig0 );
976 
977 }
978 
979 /*----------------------------------------------------------------------------
980 | Takes an abstract floating-point value having sign `zSign', exponent
981 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
982 | and returns the proper extended double-precision floating-point value
983 | corresponding to the abstract input.  This routine is just like
984 | `roundAndPackFloatx80' except that the input significand does not have to be
985 | normalized.
986 *----------------------------------------------------------------------------*/
987 
988 static floatx80 normalizeRoundAndPackFloatx80(int8_t roundingPrecision,
989                                               flag zSign, int32_t zExp,
990                                               uint64_t zSig0, uint64_t zSig1,
991                                               float_status *status)
992 {
993     int8_t shiftCount;
994 
995     if ( zSig0 == 0 ) {
996         zSig0 = zSig1;
997         zSig1 = 0;
998         zExp -= 64;
999     }
1000     shiftCount = countLeadingZeros64( zSig0 );
1001     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1002     zExp -= shiftCount;
1003     return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
1004                                 zSig0, zSig1, status);
1005 
1006 }
1007 
1008 /*----------------------------------------------------------------------------
1009 | Returns the least-significant 64 fraction bits of the quadruple-precision
1010 | floating-point value `a'.
1011 *----------------------------------------------------------------------------*/
1012 
1013 static inline uint64_t extractFloat128Frac1( float128 a )
1014 {
1015 
1016     return a.low;
1017 
1018 }
1019 
1020 /*----------------------------------------------------------------------------
1021 | Returns the most-significant 48 fraction bits of the quadruple-precision
1022 | floating-point value `a'.
1023 *----------------------------------------------------------------------------*/
1024 
1025 static inline uint64_t extractFloat128Frac0( float128 a )
1026 {
1027 
1028     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
1029 
1030 }
1031 
1032 /*----------------------------------------------------------------------------
1033 | Returns the exponent bits of the quadruple-precision floating-point value
1034 | `a'.
1035 *----------------------------------------------------------------------------*/
1036 
1037 static inline int32_t extractFloat128Exp( float128 a )
1038 {
1039 
1040     return ( a.high>>48 ) & 0x7FFF;
1041 
1042 }
1043 
1044 /*----------------------------------------------------------------------------
1045 | Returns the sign bit of the quadruple-precision floating-point value `a'.
1046 *----------------------------------------------------------------------------*/
1047 
1048 static inline flag extractFloat128Sign( float128 a )
1049 {
1050 
1051     return a.high>>63;
1052 
1053 }
1054 
1055 /*----------------------------------------------------------------------------
1056 | Normalizes the subnormal quadruple-precision floating-point value
1057 | represented by the denormalized significand formed by the concatenation of
1058 | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
1059 | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
1060 | significand are stored at the location pointed to by `zSig0Ptr', and the
1061 | least significant 64 bits of the normalized significand are stored at the
1062 | location pointed to by `zSig1Ptr'.
1063 *----------------------------------------------------------------------------*/
1064 
1065 static void
1066  normalizeFloat128Subnormal(
1067      uint64_t aSig0,
1068      uint64_t aSig1,
1069      int32_t *zExpPtr,
1070      uint64_t *zSig0Ptr,
1071      uint64_t *zSig1Ptr
1072  )
1073 {
1074     int8_t shiftCount;
1075 
1076     if ( aSig0 == 0 ) {
1077         shiftCount = countLeadingZeros64( aSig1 ) - 15;
1078         if ( shiftCount < 0 ) {
1079             *zSig0Ptr = aSig1>>( - shiftCount );
1080             *zSig1Ptr = aSig1<<( shiftCount & 63 );
1081         }
1082         else {
1083             *zSig0Ptr = aSig1<<shiftCount;
1084             *zSig1Ptr = 0;
1085         }
1086         *zExpPtr = - shiftCount - 63;
1087     }
1088     else {
1089         shiftCount = countLeadingZeros64( aSig0 ) - 15;
1090         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
1091         *zExpPtr = 1 - shiftCount;
1092     }
1093 
1094 }
1095 
1096 /*----------------------------------------------------------------------------
1097 | Packs the sign `zSign', the exponent `zExp', and the significand formed
1098 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
1099 | floating-point value, returning the result.  After being shifted into the
1100 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
1101 | added together to form the most significant 32 bits of the result.  This
1102 | means that any integer portion of `zSig0' will be added into the exponent.
1103 | Since a properly normalized significand will have an integer portion equal
1104 | to 1, the `zExp' input should be 1 less than the desired result exponent
1105 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
1106 | significand.
1107 *----------------------------------------------------------------------------*/
1108 
1109 static inline float128
1110  packFloat128( flag zSign, int32_t zExp, uint64_t zSig0, uint64_t zSig1 )
1111 {
1112     float128 z;
1113 
1114     z.low = zSig1;
1115     z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
1116     return z;
1117 
1118 }
1119 
1120 /*----------------------------------------------------------------------------
1121 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1122 | and extended significand formed by the concatenation of `zSig0', `zSig1',
1123 | and `zSig2', and returns the proper quadruple-precision floating-point value
1124 | corresponding to the abstract input.  Ordinarily, the abstract value is
1125 | simply rounded and packed into the quadruple-precision format, with the
1126 | inexact exception raised if the abstract input cannot be represented
1127 | exactly.  However, if the abstract value is too large, the overflow and
1128 | inexact exceptions are raised and an infinity or maximal finite value is
1129 | returned.  If the abstract value is too small, the input value is rounded to
1130 | a subnormal number, and the underflow and inexact exceptions are raised if
1131 | the abstract input cannot be represented exactly as a subnormal quadruple-
1132 | precision floating-point number.
1133 |     The input significand must be normalized or smaller.  If the input
1134 | significand is not normalized, `zExp' must be 0; in that case, the result
1135 | returned is a subnormal number, and it must not require rounding.  In the
1136 | usual case that the input significand is normalized, `zExp' must be 1 less
1137 | than the ``true'' floating-point exponent.  The handling of underflow and
1138 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1139 *----------------------------------------------------------------------------*/
1140 
1141 static float128 roundAndPackFloat128(flag zSign, int32_t zExp,
1142                                      uint64_t zSig0, uint64_t zSig1,
1143                                      uint64_t zSig2, float_status *status)
1144 {
1145     int8_t roundingMode;
1146     flag roundNearestEven, increment, isTiny;
1147 
1148     roundingMode = status->float_rounding_mode;
1149     roundNearestEven = ( roundingMode == float_round_nearest_even );
1150     switch (roundingMode) {
1151     case float_round_nearest_even:
1152     case float_round_ties_away:
1153         increment = ((int64_t)zSig2 < 0);
1154         break;
1155     case float_round_to_zero:
1156         increment = 0;
1157         break;
1158     case float_round_up:
1159         increment = !zSign && zSig2;
1160         break;
1161     case float_round_down:
1162         increment = zSign && zSig2;
1163         break;
1164     case float_round_to_odd:
1165         increment = !(zSig1 & 0x1) && zSig2;
1166         break;
1167     default:
1168         abort();
1169     }
1170     if ( 0x7FFD <= (uint32_t) zExp ) {
1171         if (    ( 0x7FFD < zExp )
1172              || (    ( zExp == 0x7FFD )
1173                   && eq128(
1174                          LIT64( 0x0001FFFFFFFFFFFF ),
1175                          LIT64( 0xFFFFFFFFFFFFFFFF ),
1176                          zSig0,
1177                          zSig1
1178                      )
1179                   && increment
1180                 )
1181            ) {
1182             float_raise(float_flag_overflow | float_flag_inexact, status);
1183             if (    ( roundingMode == float_round_to_zero )
1184                  || ( zSign && ( roundingMode == float_round_up ) )
1185                  || ( ! zSign && ( roundingMode == float_round_down ) )
1186                  || (roundingMode == float_round_to_odd)
1187                ) {
1188                 return
1189                     packFloat128(
1190                         zSign,
1191                         0x7FFE,
1192                         LIT64( 0x0000FFFFFFFFFFFF ),
1193                         LIT64( 0xFFFFFFFFFFFFFFFF )
1194                     );
1195             }
1196             return packFloat128( zSign, 0x7FFF, 0, 0 );
1197         }
1198         if ( zExp < 0 ) {
1199             if (status->flush_to_zero) {
1200                 float_raise(float_flag_output_denormal, status);
1201                 return packFloat128(zSign, 0, 0, 0);
1202             }
1203             isTiny =
1204                    (status->float_detect_tininess
1205                     == float_tininess_before_rounding)
1206                 || ( zExp < -1 )
1207                 || ! increment
1208                 || lt128(
1209                        zSig0,
1210                        zSig1,
1211                        LIT64( 0x0001FFFFFFFFFFFF ),
1212                        LIT64( 0xFFFFFFFFFFFFFFFF )
1213                    );
1214             shift128ExtraRightJamming(
1215                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1216             zExp = 0;
1217             if (isTiny && zSig2) {
1218                 float_raise(float_flag_underflow, status);
1219             }
1220             switch (roundingMode) {
1221             case float_round_nearest_even:
1222             case float_round_ties_away:
1223                 increment = ((int64_t)zSig2 < 0);
1224                 break;
1225             case float_round_to_zero:
1226                 increment = 0;
1227                 break;
1228             case float_round_up:
1229                 increment = !zSign && zSig2;
1230                 break;
1231             case float_round_down:
1232                 increment = zSign && zSig2;
1233                 break;
1234             case float_round_to_odd:
1235                 increment = !(zSig1 & 0x1) && zSig2;
1236                 break;
1237             default:
1238                 abort();
1239             }
1240         }
1241     }
1242     if (zSig2) {
1243         status->float_exception_flags |= float_flag_inexact;
1244     }
1245     if ( increment ) {
1246         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1247         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1248     }
1249     else {
1250         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1251     }
1252     return packFloat128( zSign, zExp, zSig0, zSig1 );
1253 
1254 }
1255 
1256 /*----------------------------------------------------------------------------
1257 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1258 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1259 | returns the proper quadruple-precision floating-point value corresponding
1260 | to the abstract input.  This routine is just like `roundAndPackFloat128'
1261 | except that the input significand has fewer bits and does not have to be
1262 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1263 | point exponent.
1264 *----------------------------------------------------------------------------*/
1265 
1266 static float128 normalizeRoundAndPackFloat128(flag zSign, int32_t zExp,
1267                                               uint64_t zSig0, uint64_t zSig1,
1268                                               float_status *status)
1269 {
1270     int8_t shiftCount;
1271     uint64_t zSig2;
1272 
1273     if ( zSig0 == 0 ) {
1274         zSig0 = zSig1;
1275         zSig1 = 0;
1276         zExp -= 64;
1277     }
1278     shiftCount = countLeadingZeros64( zSig0 ) - 15;
1279     if ( 0 <= shiftCount ) {
1280         zSig2 = 0;
1281         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1282     }
1283     else {
1284         shift128ExtraRightJamming(
1285             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1286     }
1287     zExp -= shiftCount;
1288     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
1289 
1290 }
1291 
1292 /*----------------------------------------------------------------------------
1293 | Returns the result of converting the 32-bit two's complement integer `a'
1294 | to the single-precision floating-point format.  The conversion is performed
1295 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1296 *----------------------------------------------------------------------------*/
1297 
1298 float32 int32_to_float32(int32_t a, float_status *status)
1299 {
1300     flag zSign;
1301 
1302     if ( a == 0 ) return float32_zero;
1303     if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1304     zSign = ( a < 0 );
1305     return normalizeRoundAndPackFloat32(zSign, 0x9C, zSign ? -a : a, status);
1306 }
1307 
1308 /*----------------------------------------------------------------------------
1309 | Returns the result of converting the 32-bit two's complement integer `a'
1310 | to the double-precision floating-point format.  The conversion is performed
1311 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1312 *----------------------------------------------------------------------------*/
1313 
1314 float64 int32_to_float64(int32_t a, float_status *status)
1315 {
1316     flag zSign;
1317     uint32_t absA;
1318     int8_t shiftCount;
1319     uint64_t zSig;
1320 
1321     if ( a == 0 ) return float64_zero;
1322     zSign = ( a < 0 );
1323     absA = zSign ? - a : a;
1324     shiftCount = countLeadingZeros32( absA ) + 21;
1325     zSig = absA;
1326     return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1327 
1328 }
1329 
1330 /*----------------------------------------------------------------------------
1331 | Returns the result of converting the 32-bit two's complement integer `a'
1332 | to the extended double-precision floating-point format.  The conversion
1333 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1334 | Arithmetic.
1335 *----------------------------------------------------------------------------*/
1336 
1337 floatx80 int32_to_floatx80(int32_t a, float_status *status)
1338 {
1339     flag zSign;
1340     uint32_t absA;
1341     int8_t shiftCount;
1342     uint64_t zSig;
1343 
1344     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1345     zSign = ( a < 0 );
1346     absA = zSign ? - a : a;
1347     shiftCount = countLeadingZeros32( absA ) + 32;
1348     zSig = absA;
1349     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1350 
1351 }
1352 
1353 /*----------------------------------------------------------------------------
1354 | Returns the result of converting the 32-bit two's complement integer `a' to
1355 | the quadruple-precision floating-point format.  The conversion is performed
1356 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1357 *----------------------------------------------------------------------------*/
1358 
1359 float128 int32_to_float128(int32_t a, float_status *status)
1360 {
1361     flag zSign;
1362     uint32_t absA;
1363     int8_t shiftCount;
1364     uint64_t zSig0;
1365 
1366     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1367     zSign = ( a < 0 );
1368     absA = zSign ? - a : a;
1369     shiftCount = countLeadingZeros32( absA ) + 17;
1370     zSig0 = absA;
1371     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1372 
1373 }
1374 
1375 /*----------------------------------------------------------------------------
1376 | Returns the result of converting the 64-bit two's complement integer `a'
1377 | to the single-precision floating-point format.  The conversion is performed
1378 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1379 *----------------------------------------------------------------------------*/
1380 
1381 float32 int64_to_float32(int64_t a, float_status *status)
1382 {
1383     flag zSign;
1384     uint64_t absA;
1385     int8_t shiftCount;
1386 
1387     if ( a == 0 ) return float32_zero;
1388     zSign = ( a < 0 );
1389     absA = zSign ? - a : a;
1390     shiftCount = countLeadingZeros64( absA ) - 40;
1391     if ( 0 <= shiftCount ) {
1392         return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1393     }
1394     else {
1395         shiftCount += 7;
1396         if ( shiftCount < 0 ) {
1397             shift64RightJamming( absA, - shiftCount, &absA );
1398         }
1399         else {
1400             absA <<= shiftCount;
1401         }
1402         return roundAndPackFloat32(zSign, 0x9C - shiftCount, absA, status);
1403     }
1404 
1405 }
1406 
1407 /*----------------------------------------------------------------------------
1408 | Returns the result of converting the 64-bit two's complement integer `a'
1409 | to the double-precision floating-point format.  The conversion is performed
1410 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1411 *----------------------------------------------------------------------------*/
1412 
1413 float64 int64_to_float64(int64_t a, float_status *status)
1414 {
1415     flag zSign;
1416 
1417     if ( a == 0 ) return float64_zero;
1418     if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
1419         return packFloat64( 1, 0x43E, 0 );
1420     }
1421     zSign = ( a < 0 );
1422     return normalizeRoundAndPackFloat64(zSign, 0x43C, zSign ? -a : a, status);
1423 }
1424 
1425 /*----------------------------------------------------------------------------
1426 | Returns the result of converting the 64-bit two's complement integer `a'
1427 | to the extended double-precision floating-point format.  The conversion
1428 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1429 | Arithmetic.
1430 *----------------------------------------------------------------------------*/
1431 
1432 floatx80 int64_to_floatx80(int64_t a, float_status *status)
1433 {
1434     flag zSign;
1435     uint64_t absA;
1436     int8_t shiftCount;
1437 
1438     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1439     zSign = ( a < 0 );
1440     absA = zSign ? - a : a;
1441     shiftCount = countLeadingZeros64( absA );
1442     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1443 
1444 }
1445 
1446 /*----------------------------------------------------------------------------
1447 | Returns the result of converting the 64-bit two's complement integer `a' to
1448 | the quadruple-precision floating-point format.  The conversion is performed
1449 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1450 *----------------------------------------------------------------------------*/
1451 
1452 float128 int64_to_float128(int64_t a, float_status *status)
1453 {
1454     flag zSign;
1455     uint64_t absA;
1456     int8_t shiftCount;
1457     int32_t zExp;
1458     uint64_t zSig0, zSig1;
1459 
1460     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1461     zSign = ( a < 0 );
1462     absA = zSign ? - a : a;
1463     shiftCount = countLeadingZeros64( absA ) + 49;
1464     zExp = 0x406E - shiftCount;
1465     if ( 64 <= shiftCount ) {
1466         zSig1 = 0;
1467         zSig0 = absA;
1468         shiftCount -= 64;
1469     }
1470     else {
1471         zSig1 = absA;
1472         zSig0 = 0;
1473     }
1474     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1475     return packFloat128( zSign, zExp, zSig0, zSig1 );
1476 
1477 }
1478 
1479 /*----------------------------------------------------------------------------
1480 | Returns the result of converting the 64-bit unsigned integer `a'
1481 | to the single-precision floating-point format.  The conversion is performed
1482 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1483 *----------------------------------------------------------------------------*/
1484 
1485 float32 uint64_to_float32(uint64_t a, float_status *status)
1486 {
1487     int shiftcount;
1488 
1489     if (a == 0) {
1490         return float32_zero;
1491     }
1492 
1493     /* Determine (left) shift needed to put first set bit into bit posn 23
1494      * (since packFloat32() expects the binary point between bits 23 and 22);
1495      * this is the fast case for smallish numbers.
1496      */
1497     shiftcount = countLeadingZeros64(a) - 40;
1498     if (shiftcount >= 0) {
1499         return packFloat32(0, 0x95 - shiftcount, a << shiftcount);
1500     }
1501     /* Otherwise we need to do a round-and-pack. roundAndPackFloat32()
1502      * expects the binary point between bits 30 and 29, hence the + 7.
1503      */
1504     shiftcount += 7;
1505     if (shiftcount < 0) {
1506         shift64RightJamming(a, -shiftcount, &a);
1507     } else {
1508         a <<= shiftcount;
1509     }
1510 
1511     return roundAndPackFloat32(0, 0x9c - shiftcount, a, status);
1512 }
1513 
1514 /*----------------------------------------------------------------------------
1515 | Returns the result of converting the 64-bit unsigned integer `a'
1516 | to the double-precision floating-point format.  The conversion is performed
1517 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1518 *----------------------------------------------------------------------------*/
1519 
1520 float64 uint64_to_float64(uint64_t a, float_status *status)
1521 {
1522     int exp = 0x43C;
1523     int shiftcount;
1524 
1525     if (a == 0) {
1526         return float64_zero;
1527     }
1528 
1529     shiftcount = countLeadingZeros64(a) - 1;
1530     if (shiftcount < 0) {
1531         shift64RightJamming(a, -shiftcount, &a);
1532     } else {
1533         a <<= shiftcount;
1534     }
1535     return roundAndPackFloat64(0, exp - shiftcount, a, status);
1536 }
1537 
1538 /*----------------------------------------------------------------------------
1539 | Returns the result of converting the 64-bit unsigned integer `a'
1540 | to the quadruple-precision floating-point format.  The conversion is performed
1541 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1542 *----------------------------------------------------------------------------*/
1543 
1544 float128 uint64_to_float128(uint64_t a, float_status *status)
1545 {
1546     if (a == 0) {
1547         return float128_zero;
1548     }
1549     return normalizeRoundAndPackFloat128(0, 0x406E, a, 0, status);
1550 }
1551 
1552 /*----------------------------------------------------------------------------
1553 | Returns the result of converting the single-precision floating-point value
1554 | `a' to the 32-bit two's complement integer format.  The conversion is
1555 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1556 | Arithmetic---which means in particular that the conversion is rounded
1557 | according to the current rounding mode.  If `a' is a NaN, the largest
1558 | positive integer is returned.  Otherwise, if the conversion overflows, the
1559 | largest integer with the same sign as `a' is returned.
1560 *----------------------------------------------------------------------------*/
1561 
1562 int32_t float32_to_int32(float32 a, float_status *status)
1563 {
1564     flag aSign;
1565     int aExp;
1566     int shiftCount;
1567     uint32_t aSig;
1568     uint64_t aSig64;
1569 
1570     a = float32_squash_input_denormal(a, status);
1571     aSig = extractFloat32Frac( a );
1572     aExp = extractFloat32Exp( a );
1573     aSign = extractFloat32Sign( a );
1574     if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1575     if ( aExp ) aSig |= 0x00800000;
1576     shiftCount = 0xAF - aExp;
1577     aSig64 = aSig;
1578     aSig64 <<= 32;
1579     if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1580     return roundAndPackInt32(aSign, aSig64, status);
1581 
1582 }
1583 
1584 /*----------------------------------------------------------------------------
1585 | Returns the result of converting the single-precision floating-point value
1586 | `a' to the 32-bit two's complement integer format.  The conversion is
1587 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1588 | Arithmetic, except that the conversion is always rounded toward zero.
1589 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1590 | the conversion overflows, the largest integer with the same sign as `a' is
1591 | returned.
1592 *----------------------------------------------------------------------------*/
1593 
1594 int32_t float32_to_int32_round_to_zero(float32 a, float_status *status)
1595 {
1596     flag aSign;
1597     int aExp;
1598     int shiftCount;
1599     uint32_t aSig;
1600     int32_t z;
1601     a = float32_squash_input_denormal(a, status);
1602 
1603     aSig = extractFloat32Frac( a );
1604     aExp = extractFloat32Exp( a );
1605     aSign = extractFloat32Sign( a );
1606     shiftCount = aExp - 0x9E;
1607     if ( 0 <= shiftCount ) {
1608         if ( float32_val(a) != 0xCF000000 ) {
1609             float_raise(float_flag_invalid, status);
1610             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1611         }
1612         return (int32_t) 0x80000000;
1613     }
1614     else if ( aExp <= 0x7E ) {
1615         if (aExp | aSig) {
1616             status->float_exception_flags |= float_flag_inexact;
1617         }
1618         return 0;
1619     }
1620     aSig = ( aSig | 0x00800000 )<<8;
1621     z = aSig>>( - shiftCount );
1622     if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1623         status->float_exception_flags |= float_flag_inexact;
1624     }
1625     if ( aSign ) z = - z;
1626     return z;
1627 
1628 }
1629 
1630 /*----------------------------------------------------------------------------
1631 | Returns the result of converting the single-precision floating-point value
1632 | `a' to the 16-bit two's complement integer format.  The conversion is
1633 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1634 | Arithmetic, except that the conversion is always rounded toward zero.
1635 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1636 | the conversion overflows, the largest integer with the same sign as `a' is
1637 | returned.
1638 *----------------------------------------------------------------------------*/
1639 
1640 int16_t float32_to_int16_round_to_zero(float32 a, float_status *status)
1641 {
1642     flag aSign;
1643     int aExp;
1644     int shiftCount;
1645     uint32_t aSig;
1646     int32_t z;
1647 
1648     aSig = extractFloat32Frac( a );
1649     aExp = extractFloat32Exp( a );
1650     aSign = extractFloat32Sign( a );
1651     shiftCount = aExp - 0x8E;
1652     if ( 0 <= shiftCount ) {
1653         if ( float32_val(a) != 0xC7000000 ) {
1654             float_raise(float_flag_invalid, status);
1655             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1656                 return 0x7FFF;
1657             }
1658         }
1659         return (int32_t) 0xffff8000;
1660     }
1661     else if ( aExp <= 0x7E ) {
1662         if ( aExp | aSig ) {
1663             status->float_exception_flags |= float_flag_inexact;
1664         }
1665         return 0;
1666     }
1667     shiftCount -= 0x10;
1668     aSig = ( aSig | 0x00800000 )<<8;
1669     z = aSig>>( - shiftCount );
1670     if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1671         status->float_exception_flags |= float_flag_inexact;
1672     }
1673     if ( aSign ) {
1674         z = - z;
1675     }
1676     return z;
1677 
1678 }
1679 
1680 /*----------------------------------------------------------------------------
1681 | Returns the result of converting the single-precision floating-point value
1682 | `a' to the 64-bit two's complement integer format.  The conversion is
1683 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1684 | Arithmetic---which means in particular that the conversion is rounded
1685 | according to the current rounding mode.  If `a' is a NaN, the largest
1686 | positive integer is returned.  Otherwise, if the conversion overflows, the
1687 | largest integer with the same sign as `a' is returned.
1688 *----------------------------------------------------------------------------*/
1689 
1690 int64_t float32_to_int64(float32 a, float_status *status)
1691 {
1692     flag aSign;
1693     int aExp;
1694     int shiftCount;
1695     uint32_t aSig;
1696     uint64_t aSig64, aSigExtra;
1697     a = float32_squash_input_denormal(a, status);
1698 
1699     aSig = extractFloat32Frac( a );
1700     aExp = extractFloat32Exp( a );
1701     aSign = extractFloat32Sign( a );
1702     shiftCount = 0xBE - aExp;
1703     if ( shiftCount < 0 ) {
1704         float_raise(float_flag_invalid, status);
1705         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1706             return LIT64( 0x7FFFFFFFFFFFFFFF );
1707         }
1708         return (int64_t) LIT64( 0x8000000000000000 );
1709     }
1710     if ( aExp ) aSig |= 0x00800000;
1711     aSig64 = aSig;
1712     aSig64 <<= 40;
1713     shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1714     return roundAndPackInt64(aSign, aSig64, aSigExtra, status);
1715 
1716 }
1717 
1718 /*----------------------------------------------------------------------------
1719 | Returns the result of converting the single-precision floating-point value
1720 | `a' to the 64-bit unsigned integer format.  The conversion is
1721 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1722 | Arithmetic---which means in particular that the conversion is rounded
1723 | according to the current rounding mode.  If `a' is a NaN, the largest
1724 | unsigned integer is returned.  Otherwise, if the conversion overflows, the
1725 | largest unsigned integer is returned.  If the 'a' is negative, the result
1726 | is rounded and zero is returned; values that do not round to zero will
1727 | raise the inexact exception flag.
1728 *----------------------------------------------------------------------------*/
1729 
1730 uint64_t float32_to_uint64(float32 a, float_status *status)
1731 {
1732     flag aSign;
1733     int aExp;
1734     int shiftCount;
1735     uint32_t aSig;
1736     uint64_t aSig64, aSigExtra;
1737     a = float32_squash_input_denormal(a, status);
1738 
1739     aSig = extractFloat32Frac(a);
1740     aExp = extractFloat32Exp(a);
1741     aSign = extractFloat32Sign(a);
1742     if ((aSign) && (aExp > 126)) {
1743         float_raise(float_flag_invalid, status);
1744         if (float32_is_any_nan(a)) {
1745             return LIT64(0xFFFFFFFFFFFFFFFF);
1746         } else {
1747             return 0;
1748         }
1749     }
1750     shiftCount = 0xBE - aExp;
1751     if (aExp) {
1752         aSig |= 0x00800000;
1753     }
1754     if (shiftCount < 0) {
1755         float_raise(float_flag_invalid, status);
1756         return LIT64(0xFFFFFFFFFFFFFFFF);
1757     }
1758 
1759     aSig64 = aSig;
1760     aSig64 <<= 40;
1761     shift64ExtraRightJamming(aSig64, 0, shiftCount, &aSig64, &aSigExtra);
1762     return roundAndPackUint64(aSign, aSig64, aSigExtra, status);
1763 }
1764 
1765 /*----------------------------------------------------------------------------
1766 | Returns the result of converting the single-precision floating-point value
1767 | `a' to the 64-bit unsigned integer format.  The conversion is
1768 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1769 | Arithmetic, except that the conversion is always rounded toward zero.  If
1770 | `a' is a NaN, the largest unsigned integer is returned.  Otherwise, if the
1771 | conversion overflows, the largest unsigned integer is returned.  If the
1772 | 'a' is negative, the result is rounded and zero is returned; values that do
1773 | not round to zero will raise the inexact flag.
1774 *----------------------------------------------------------------------------*/
1775 
1776 uint64_t float32_to_uint64_round_to_zero(float32 a, float_status *status)
1777 {
1778     signed char current_rounding_mode = status->float_rounding_mode;
1779     set_float_rounding_mode(float_round_to_zero, status);
1780     int64_t v = float32_to_uint64(a, status);
1781     set_float_rounding_mode(current_rounding_mode, status);
1782     return v;
1783 }
1784 
1785 /*----------------------------------------------------------------------------
1786 | Returns the result of converting the single-precision floating-point value
1787 | `a' to the 64-bit two's complement integer format.  The conversion is
1788 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1789 | Arithmetic, except that the conversion is always rounded toward zero.  If
1790 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1791 | conversion overflows, the largest integer with the same sign as `a' is
1792 | returned.
1793 *----------------------------------------------------------------------------*/
1794 
1795 int64_t float32_to_int64_round_to_zero(float32 a, float_status *status)
1796 {
1797     flag aSign;
1798     int aExp;
1799     int shiftCount;
1800     uint32_t aSig;
1801     uint64_t aSig64;
1802     int64_t z;
1803     a = float32_squash_input_denormal(a, status);
1804 
1805     aSig = extractFloat32Frac( a );
1806     aExp = extractFloat32Exp( a );
1807     aSign = extractFloat32Sign( a );
1808     shiftCount = aExp - 0xBE;
1809     if ( 0 <= shiftCount ) {
1810         if ( float32_val(a) != 0xDF000000 ) {
1811             float_raise(float_flag_invalid, status);
1812             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1813                 return LIT64( 0x7FFFFFFFFFFFFFFF );
1814             }
1815         }
1816         return (int64_t) LIT64( 0x8000000000000000 );
1817     }
1818     else if ( aExp <= 0x7E ) {
1819         if (aExp | aSig) {
1820             status->float_exception_flags |= float_flag_inexact;
1821         }
1822         return 0;
1823     }
1824     aSig64 = aSig | 0x00800000;
1825     aSig64 <<= 40;
1826     z = aSig64>>( - shiftCount );
1827     if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
1828         status->float_exception_flags |= float_flag_inexact;
1829     }
1830     if ( aSign ) z = - z;
1831     return z;
1832 
1833 }
1834 
1835 /*----------------------------------------------------------------------------
1836 | Returns the result of converting the single-precision floating-point value
1837 | `a' to the double-precision floating-point format.  The conversion is
1838 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1839 | Arithmetic.
1840 *----------------------------------------------------------------------------*/
1841 
1842 float64 float32_to_float64(float32 a, float_status *status)
1843 {
1844     flag aSign;
1845     int aExp;
1846     uint32_t aSig;
1847     a = float32_squash_input_denormal(a, status);
1848 
1849     aSig = extractFloat32Frac( a );
1850     aExp = extractFloat32Exp( a );
1851     aSign = extractFloat32Sign( a );
1852     if ( aExp == 0xFF ) {
1853         if (aSig) {
1854             return commonNaNToFloat64(float32ToCommonNaN(a, status), status);
1855         }
1856         return packFloat64( aSign, 0x7FF, 0 );
1857     }
1858     if ( aExp == 0 ) {
1859         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1860         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1861         --aExp;
1862     }
1863     return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
1864 
1865 }
1866 
1867 /*----------------------------------------------------------------------------
1868 | Returns the result of converting the single-precision floating-point value
1869 | `a' to the extended double-precision floating-point format.  The conversion
1870 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1871 | Arithmetic.
1872 *----------------------------------------------------------------------------*/
1873 
1874 floatx80 float32_to_floatx80(float32 a, float_status *status)
1875 {
1876     flag aSign;
1877     int aExp;
1878     uint32_t aSig;
1879 
1880     a = float32_squash_input_denormal(a, status);
1881     aSig = extractFloat32Frac( a );
1882     aExp = extractFloat32Exp( a );
1883     aSign = extractFloat32Sign( a );
1884     if ( aExp == 0xFF ) {
1885         if (aSig) {
1886             return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
1887         }
1888         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1889     }
1890     if ( aExp == 0 ) {
1891         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1892         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1893     }
1894     aSig |= 0x00800000;
1895     return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
1896 
1897 }
1898 
1899 /*----------------------------------------------------------------------------
1900 | Returns the result of converting the single-precision floating-point value
1901 | `a' to the double-precision floating-point format.  The conversion is
1902 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1903 | Arithmetic.
1904 *----------------------------------------------------------------------------*/
1905 
1906 float128 float32_to_float128(float32 a, float_status *status)
1907 {
1908     flag aSign;
1909     int aExp;
1910     uint32_t aSig;
1911 
1912     a = float32_squash_input_denormal(a, status);
1913     aSig = extractFloat32Frac( a );
1914     aExp = extractFloat32Exp( a );
1915     aSign = extractFloat32Sign( a );
1916     if ( aExp == 0xFF ) {
1917         if (aSig) {
1918             return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
1919         }
1920         return packFloat128( aSign, 0x7FFF, 0, 0 );
1921     }
1922     if ( aExp == 0 ) {
1923         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1924         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1925         --aExp;
1926     }
1927     return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
1928 
1929 }
1930 
1931 /*----------------------------------------------------------------------------
1932 | Rounds the single-precision floating-point value `a' to an integer, and
1933 | returns the result as a single-precision floating-point value.  The
1934 | operation is performed according to the IEC/IEEE Standard for Binary
1935 | Floating-Point Arithmetic.
1936 *----------------------------------------------------------------------------*/
1937 
1938 float32 float32_round_to_int(float32 a, float_status *status)
1939 {
1940     flag aSign;
1941     int aExp;
1942     uint32_t lastBitMask, roundBitsMask;
1943     uint32_t z;
1944     a = float32_squash_input_denormal(a, status);
1945 
1946     aExp = extractFloat32Exp( a );
1947     if ( 0x96 <= aExp ) {
1948         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1949             return propagateFloat32NaN(a, a, status);
1950         }
1951         return a;
1952     }
1953     if ( aExp <= 0x7E ) {
1954         if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
1955         status->float_exception_flags |= float_flag_inexact;
1956         aSign = extractFloat32Sign( a );
1957         switch (status->float_rounding_mode) {
1958          case float_round_nearest_even:
1959             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1960                 return packFloat32( aSign, 0x7F, 0 );
1961             }
1962             break;
1963         case float_round_ties_away:
1964             if (aExp == 0x7E) {
1965                 return packFloat32(aSign, 0x7F, 0);
1966             }
1967             break;
1968          case float_round_down:
1969             return make_float32(aSign ? 0xBF800000 : 0);
1970          case float_round_up:
1971             return make_float32(aSign ? 0x80000000 : 0x3F800000);
1972         }
1973         return packFloat32( aSign, 0, 0 );
1974     }
1975     lastBitMask = 1;
1976     lastBitMask <<= 0x96 - aExp;
1977     roundBitsMask = lastBitMask - 1;
1978     z = float32_val(a);
1979     switch (status->float_rounding_mode) {
1980     case float_round_nearest_even:
1981         z += lastBitMask>>1;
1982         if ((z & roundBitsMask) == 0) {
1983             z &= ~lastBitMask;
1984         }
1985         break;
1986     case float_round_ties_away:
1987         z += lastBitMask >> 1;
1988         break;
1989     case float_round_to_zero:
1990         break;
1991     case float_round_up:
1992         if (!extractFloat32Sign(make_float32(z))) {
1993             z += roundBitsMask;
1994         }
1995         break;
1996     case float_round_down:
1997         if (extractFloat32Sign(make_float32(z))) {
1998             z += roundBitsMask;
1999         }
2000         break;
2001     default:
2002         abort();
2003     }
2004     z &= ~ roundBitsMask;
2005     if (z != float32_val(a)) {
2006         status->float_exception_flags |= float_flag_inexact;
2007     }
2008     return make_float32(z);
2009 
2010 }
2011 
2012 /*----------------------------------------------------------------------------
2013 | Returns the result of adding the absolute values of the single-precision
2014 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
2015 | before being returned.  `zSign' is ignored if the result is a NaN.
2016 | The addition is performed according to the IEC/IEEE Standard for Binary
2017 | Floating-Point Arithmetic.
2018 *----------------------------------------------------------------------------*/
2019 
2020 static float32 addFloat32Sigs(float32 a, float32 b, flag zSign,
2021                               float_status *status)
2022 {
2023     int aExp, bExp, zExp;
2024     uint32_t aSig, bSig, zSig;
2025     int expDiff;
2026 
2027     aSig = extractFloat32Frac( a );
2028     aExp = extractFloat32Exp( a );
2029     bSig = extractFloat32Frac( b );
2030     bExp = extractFloat32Exp( b );
2031     expDiff = aExp - bExp;
2032     aSig <<= 6;
2033     bSig <<= 6;
2034     if ( 0 < expDiff ) {
2035         if ( aExp == 0xFF ) {
2036             if (aSig) {
2037                 return propagateFloat32NaN(a, b, status);
2038             }
2039             return a;
2040         }
2041         if ( bExp == 0 ) {
2042             --expDiff;
2043         }
2044         else {
2045             bSig |= 0x20000000;
2046         }
2047         shift32RightJamming( bSig, expDiff, &bSig );
2048         zExp = aExp;
2049     }
2050     else if ( expDiff < 0 ) {
2051         if ( bExp == 0xFF ) {
2052             if (bSig) {
2053                 return propagateFloat32NaN(a, b, status);
2054             }
2055             return packFloat32( zSign, 0xFF, 0 );
2056         }
2057         if ( aExp == 0 ) {
2058             ++expDiff;
2059         }
2060         else {
2061             aSig |= 0x20000000;
2062         }
2063         shift32RightJamming( aSig, - expDiff, &aSig );
2064         zExp = bExp;
2065     }
2066     else {
2067         if ( aExp == 0xFF ) {
2068             if (aSig | bSig) {
2069                 return propagateFloat32NaN(a, b, status);
2070             }
2071             return a;
2072         }
2073         if ( aExp == 0 ) {
2074             if (status->flush_to_zero) {
2075                 if (aSig | bSig) {
2076                     float_raise(float_flag_output_denormal, status);
2077                 }
2078                 return packFloat32(zSign, 0, 0);
2079             }
2080             return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
2081         }
2082         zSig = 0x40000000 + aSig + bSig;
2083         zExp = aExp;
2084         goto roundAndPack;
2085     }
2086     aSig |= 0x20000000;
2087     zSig = ( aSig + bSig )<<1;
2088     --zExp;
2089     if ( (int32_t) zSig < 0 ) {
2090         zSig = aSig + bSig;
2091         ++zExp;
2092     }
2093  roundAndPack:
2094     return roundAndPackFloat32(zSign, zExp, zSig, status);
2095 
2096 }
2097 
2098 /*----------------------------------------------------------------------------
2099 | Returns the result of subtracting the absolute values of the single-
2100 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
2101 | difference is negated before being returned.  `zSign' is ignored if the
2102 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
2103 | Standard for Binary Floating-Point Arithmetic.
2104 *----------------------------------------------------------------------------*/
2105 
2106 static float32 subFloat32Sigs(float32 a, float32 b, flag zSign,
2107                               float_status *status)
2108 {
2109     int aExp, bExp, zExp;
2110     uint32_t aSig, bSig, zSig;
2111     int expDiff;
2112 
2113     aSig = extractFloat32Frac( a );
2114     aExp = extractFloat32Exp( a );
2115     bSig = extractFloat32Frac( b );
2116     bExp = extractFloat32Exp( b );
2117     expDiff = aExp - bExp;
2118     aSig <<= 7;
2119     bSig <<= 7;
2120     if ( 0 < expDiff ) goto aExpBigger;
2121     if ( expDiff < 0 ) goto bExpBigger;
2122     if ( aExp == 0xFF ) {
2123         if (aSig | bSig) {
2124             return propagateFloat32NaN(a, b, status);
2125         }
2126         float_raise(float_flag_invalid, status);
2127         return float32_default_nan(status);
2128     }
2129     if ( aExp == 0 ) {
2130         aExp = 1;
2131         bExp = 1;
2132     }
2133     if ( bSig < aSig ) goto aBigger;
2134     if ( aSig < bSig ) goto bBigger;
2135     return packFloat32(status->float_rounding_mode == float_round_down, 0, 0);
2136  bExpBigger:
2137     if ( bExp == 0xFF ) {
2138         if (bSig) {
2139             return propagateFloat32NaN(a, b, status);
2140         }
2141         return packFloat32( zSign ^ 1, 0xFF, 0 );
2142     }
2143     if ( aExp == 0 ) {
2144         ++expDiff;
2145     }
2146     else {
2147         aSig |= 0x40000000;
2148     }
2149     shift32RightJamming( aSig, - expDiff, &aSig );
2150     bSig |= 0x40000000;
2151  bBigger:
2152     zSig = bSig - aSig;
2153     zExp = bExp;
2154     zSign ^= 1;
2155     goto normalizeRoundAndPack;
2156  aExpBigger:
2157     if ( aExp == 0xFF ) {
2158         if (aSig) {
2159             return propagateFloat32NaN(a, b, status);
2160         }
2161         return a;
2162     }
2163     if ( bExp == 0 ) {
2164         --expDiff;
2165     }
2166     else {
2167         bSig |= 0x40000000;
2168     }
2169     shift32RightJamming( bSig, expDiff, &bSig );
2170     aSig |= 0x40000000;
2171  aBigger:
2172     zSig = aSig - bSig;
2173     zExp = aExp;
2174  normalizeRoundAndPack:
2175     --zExp;
2176     return normalizeRoundAndPackFloat32(zSign, zExp, zSig, status);
2177 
2178 }
2179 
2180 /*----------------------------------------------------------------------------
2181 | Returns the result of adding the single-precision floating-point values `a'
2182 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
2183 | Binary Floating-Point Arithmetic.
2184 *----------------------------------------------------------------------------*/
2185 
2186 float32 float32_add(float32 a, float32 b, float_status *status)
2187 {
2188     flag aSign, bSign;
2189     a = float32_squash_input_denormal(a, status);
2190     b = float32_squash_input_denormal(b, status);
2191 
2192     aSign = extractFloat32Sign( a );
2193     bSign = extractFloat32Sign( b );
2194     if ( aSign == bSign ) {
2195         return addFloat32Sigs(a, b, aSign, status);
2196     }
2197     else {
2198         return subFloat32Sigs(a, b, aSign, status);
2199     }
2200 
2201 }
2202 
2203 /*----------------------------------------------------------------------------
2204 | Returns the result of subtracting the single-precision floating-point values
2205 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2206 | for Binary Floating-Point Arithmetic.
2207 *----------------------------------------------------------------------------*/
2208 
2209 float32 float32_sub(float32 a, float32 b, float_status *status)
2210 {
2211     flag aSign, bSign;
2212     a = float32_squash_input_denormal(a, status);
2213     b = float32_squash_input_denormal(b, status);
2214 
2215     aSign = extractFloat32Sign( a );
2216     bSign = extractFloat32Sign( b );
2217     if ( aSign == bSign ) {
2218         return subFloat32Sigs(a, b, aSign, status);
2219     }
2220     else {
2221         return addFloat32Sigs(a, b, aSign, status);
2222     }
2223 
2224 }
2225 
2226 /*----------------------------------------------------------------------------
2227 | Returns the result of multiplying the single-precision floating-point values
2228 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2229 | for Binary Floating-Point Arithmetic.
2230 *----------------------------------------------------------------------------*/
2231 
2232 float32 float32_mul(float32 a, float32 b, float_status *status)
2233 {
2234     flag aSign, bSign, zSign;
2235     int aExp, bExp, zExp;
2236     uint32_t aSig, bSig;
2237     uint64_t zSig64;
2238     uint32_t zSig;
2239 
2240     a = float32_squash_input_denormal(a, status);
2241     b = float32_squash_input_denormal(b, status);
2242 
2243     aSig = extractFloat32Frac( a );
2244     aExp = extractFloat32Exp( a );
2245     aSign = extractFloat32Sign( a );
2246     bSig = extractFloat32Frac( b );
2247     bExp = extractFloat32Exp( b );
2248     bSign = extractFloat32Sign( b );
2249     zSign = aSign ^ bSign;
2250     if ( aExp == 0xFF ) {
2251         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2252             return propagateFloat32NaN(a, b, status);
2253         }
2254         if ( ( bExp | bSig ) == 0 ) {
2255             float_raise(float_flag_invalid, status);
2256             return float32_default_nan(status);
2257         }
2258         return packFloat32( zSign, 0xFF, 0 );
2259     }
2260     if ( bExp == 0xFF ) {
2261         if (bSig) {
2262             return propagateFloat32NaN(a, b, status);
2263         }
2264         if ( ( aExp | aSig ) == 0 ) {
2265             float_raise(float_flag_invalid, status);
2266             return float32_default_nan(status);
2267         }
2268         return packFloat32( zSign, 0xFF, 0 );
2269     }
2270     if ( aExp == 0 ) {
2271         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2272         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2273     }
2274     if ( bExp == 0 ) {
2275         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
2276         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2277     }
2278     zExp = aExp + bExp - 0x7F;
2279     aSig = ( aSig | 0x00800000 )<<7;
2280     bSig = ( bSig | 0x00800000 )<<8;
2281     shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );
2282     zSig = zSig64;
2283     if ( 0 <= (int32_t) ( zSig<<1 ) ) {
2284         zSig <<= 1;
2285         --zExp;
2286     }
2287     return roundAndPackFloat32(zSign, zExp, zSig, status);
2288 
2289 }
2290 
2291 /*----------------------------------------------------------------------------
2292 | Returns the result of dividing the single-precision floating-point value `a'
2293 | by the corresponding value `b'.  The operation is performed according to the
2294 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2295 *----------------------------------------------------------------------------*/
2296 
2297 float32 float32_div(float32 a, float32 b, float_status *status)
2298 {
2299     flag aSign, bSign, zSign;
2300     int aExp, bExp, zExp;
2301     uint32_t aSig, bSig, zSig;
2302     a = float32_squash_input_denormal(a, status);
2303     b = float32_squash_input_denormal(b, status);
2304 
2305     aSig = extractFloat32Frac( a );
2306     aExp = extractFloat32Exp( a );
2307     aSign = extractFloat32Sign( a );
2308     bSig = extractFloat32Frac( b );
2309     bExp = extractFloat32Exp( b );
2310     bSign = extractFloat32Sign( b );
2311     zSign = aSign ^ bSign;
2312     if ( aExp == 0xFF ) {
2313         if (aSig) {
2314             return propagateFloat32NaN(a, b, status);
2315         }
2316         if ( bExp == 0xFF ) {
2317             if (bSig) {
2318                 return propagateFloat32NaN(a, b, status);
2319             }
2320             float_raise(float_flag_invalid, status);
2321             return float32_default_nan(status);
2322         }
2323         return packFloat32( zSign, 0xFF, 0 );
2324     }
2325     if ( bExp == 0xFF ) {
2326         if (bSig) {
2327             return propagateFloat32NaN(a, b, status);
2328         }
2329         return packFloat32( zSign, 0, 0 );
2330     }
2331     if ( bExp == 0 ) {
2332         if ( bSig == 0 ) {
2333             if ( ( aExp | aSig ) == 0 ) {
2334                 float_raise(float_flag_invalid, status);
2335                 return float32_default_nan(status);
2336             }
2337             float_raise(float_flag_divbyzero, status);
2338             return packFloat32( zSign, 0xFF, 0 );
2339         }
2340         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2341     }
2342     if ( aExp == 0 ) {
2343         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2344         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2345     }
2346     zExp = aExp - bExp + 0x7D;
2347     aSig = ( aSig | 0x00800000 )<<7;
2348     bSig = ( bSig | 0x00800000 )<<8;
2349     if ( bSig <= ( aSig + aSig ) ) {
2350         aSig >>= 1;
2351         ++zExp;
2352     }
2353     zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;
2354     if ( ( zSig & 0x3F ) == 0 ) {
2355         zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );
2356     }
2357     return roundAndPackFloat32(zSign, zExp, zSig, status);
2358 
2359 }
2360 
2361 /*----------------------------------------------------------------------------
2362 | Returns the remainder of the single-precision floating-point value `a'
2363 | with respect to the corresponding value `b'.  The operation is performed
2364 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2365 *----------------------------------------------------------------------------*/
2366 
2367 float32 float32_rem(float32 a, float32 b, float_status *status)
2368 {
2369     flag aSign, zSign;
2370     int aExp, bExp, expDiff;
2371     uint32_t aSig, bSig;
2372     uint32_t q;
2373     uint64_t aSig64, bSig64, q64;
2374     uint32_t alternateASig;
2375     int32_t sigMean;
2376     a = float32_squash_input_denormal(a, status);
2377     b = float32_squash_input_denormal(b, status);
2378 
2379     aSig = extractFloat32Frac( a );
2380     aExp = extractFloat32Exp( a );
2381     aSign = extractFloat32Sign( a );
2382     bSig = extractFloat32Frac( b );
2383     bExp = extractFloat32Exp( b );
2384     if ( aExp == 0xFF ) {
2385         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2386             return propagateFloat32NaN(a, b, status);
2387         }
2388         float_raise(float_flag_invalid, status);
2389         return float32_default_nan(status);
2390     }
2391     if ( bExp == 0xFF ) {
2392         if (bSig) {
2393             return propagateFloat32NaN(a, b, status);
2394         }
2395         return a;
2396     }
2397     if ( bExp == 0 ) {
2398         if ( bSig == 0 ) {
2399             float_raise(float_flag_invalid, status);
2400             return float32_default_nan(status);
2401         }
2402         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2403     }
2404     if ( aExp == 0 ) {
2405         if ( aSig == 0 ) return a;
2406         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2407     }
2408     expDiff = aExp - bExp;
2409     aSig |= 0x00800000;
2410     bSig |= 0x00800000;
2411     if ( expDiff < 32 ) {
2412         aSig <<= 8;
2413         bSig <<= 8;
2414         if ( expDiff < 0 ) {
2415             if ( expDiff < -1 ) return a;
2416             aSig >>= 1;
2417         }
2418         q = ( bSig <= aSig );
2419         if ( q ) aSig -= bSig;
2420         if ( 0 < expDiff ) {
2421             q = ( ( (uint64_t) aSig )<<32 ) / bSig;
2422             q >>= 32 - expDiff;
2423             bSig >>= 2;
2424             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2425         }
2426         else {
2427             aSig >>= 2;
2428             bSig >>= 2;
2429         }
2430     }
2431     else {
2432         if ( bSig <= aSig ) aSig -= bSig;
2433         aSig64 = ( (uint64_t) aSig )<<40;
2434         bSig64 = ( (uint64_t) bSig )<<40;
2435         expDiff -= 64;
2436         while ( 0 < expDiff ) {
2437             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2438             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2439             aSig64 = - ( ( bSig * q64 )<<38 );
2440             expDiff -= 62;
2441         }
2442         expDiff += 64;
2443         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2444         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2445         q = q64>>( 64 - expDiff );
2446         bSig <<= 6;
2447         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2448     }
2449     do {
2450         alternateASig = aSig;
2451         ++q;
2452         aSig -= bSig;
2453     } while ( 0 <= (int32_t) aSig );
2454     sigMean = aSig + alternateASig;
2455     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2456         aSig = alternateASig;
2457     }
2458     zSign = ( (int32_t) aSig < 0 );
2459     if ( zSign ) aSig = - aSig;
2460     return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
2461 }
2462 
2463 /*----------------------------------------------------------------------------
2464 | Returns the result of multiplying the single-precision floating-point values
2465 | `a' and `b' then adding 'c', with no intermediate rounding step after the
2466 | multiplication.  The operation is performed according to the IEC/IEEE
2467 | Standard for Binary Floating-Point Arithmetic 754-2008.
2468 | The flags argument allows the caller to select negation of the
2469 | addend, the intermediate product, or the final result. (The difference
2470 | between this and having the caller do a separate negation is that negating
2471 | externally will flip the sign bit on NaNs.)
2472 *----------------------------------------------------------------------------*/
2473 
2474 float32 float32_muladd(float32 a, float32 b, float32 c, int flags,
2475                        float_status *status)
2476 {
2477     flag aSign, bSign, cSign, zSign;
2478     int aExp, bExp, cExp, pExp, zExp, expDiff;
2479     uint32_t aSig, bSig, cSig;
2480     flag pInf, pZero, pSign;
2481     uint64_t pSig64, cSig64, zSig64;
2482     uint32_t pSig;
2483     int shiftcount;
2484     flag signflip, infzero;
2485 
2486     a = float32_squash_input_denormal(a, status);
2487     b = float32_squash_input_denormal(b, status);
2488     c = float32_squash_input_denormal(c, status);
2489     aSig = extractFloat32Frac(a);
2490     aExp = extractFloat32Exp(a);
2491     aSign = extractFloat32Sign(a);
2492     bSig = extractFloat32Frac(b);
2493     bExp = extractFloat32Exp(b);
2494     bSign = extractFloat32Sign(b);
2495     cSig = extractFloat32Frac(c);
2496     cExp = extractFloat32Exp(c);
2497     cSign = extractFloat32Sign(c);
2498 
2499     infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) ||
2500                (aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0));
2501 
2502     /* It is implementation-defined whether the cases of (0,inf,qnan)
2503      * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
2504      * they return if they do), so we have to hand this information
2505      * off to the target-specific pick-a-NaN routine.
2506      */
2507     if (((aExp == 0xff) && aSig) ||
2508         ((bExp == 0xff) && bSig) ||
2509         ((cExp == 0xff) && cSig)) {
2510         return propagateFloat32MulAddNaN(a, b, c, infzero, status);
2511     }
2512 
2513     if (infzero) {
2514         float_raise(float_flag_invalid, status);
2515         return float32_default_nan(status);
2516     }
2517 
2518     if (flags & float_muladd_negate_c) {
2519         cSign ^= 1;
2520     }
2521 
2522     signflip = (flags & float_muladd_negate_result) ? 1 : 0;
2523 
2524     /* Work out the sign and type of the product */
2525     pSign = aSign ^ bSign;
2526     if (flags & float_muladd_negate_product) {
2527         pSign ^= 1;
2528     }
2529     pInf = (aExp == 0xff) || (bExp == 0xff);
2530     pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
2531 
2532     if (cExp == 0xff) {
2533         if (pInf && (pSign ^ cSign)) {
2534             /* addition of opposite-signed infinities => InvalidOperation */
2535             float_raise(float_flag_invalid, status);
2536             return float32_default_nan(status);
2537         }
2538         /* Otherwise generate an infinity of the same sign */
2539         return packFloat32(cSign ^ signflip, 0xff, 0);
2540     }
2541 
2542     if (pInf) {
2543         return packFloat32(pSign ^ signflip, 0xff, 0);
2544     }
2545 
2546     if (pZero) {
2547         if (cExp == 0) {
2548             if (cSig == 0) {
2549                 /* Adding two exact zeroes */
2550                 if (pSign == cSign) {
2551                     zSign = pSign;
2552                 } else if (status->float_rounding_mode == float_round_down) {
2553                     zSign = 1;
2554                 } else {
2555                     zSign = 0;
2556                 }
2557                 return packFloat32(zSign ^ signflip, 0, 0);
2558             }
2559             /* Exact zero plus a denorm */
2560             if (status->flush_to_zero) {
2561                 float_raise(float_flag_output_denormal, status);
2562                 return packFloat32(cSign ^ signflip, 0, 0);
2563             }
2564         }
2565         /* Zero plus something non-zero : just return the something */
2566         if (flags & float_muladd_halve_result) {
2567             if (cExp == 0) {
2568                 normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2569             }
2570             /* Subtract one to halve, and one again because roundAndPackFloat32
2571              * wants one less than the true exponent.
2572              */
2573             cExp -= 2;
2574             cSig = (cSig | 0x00800000) << 7;
2575             return roundAndPackFloat32(cSign ^ signflip, cExp, cSig, status);
2576         }
2577         return packFloat32(cSign ^ signflip, cExp, cSig);
2578     }
2579 
2580     if (aExp == 0) {
2581         normalizeFloat32Subnormal(aSig, &aExp, &aSig);
2582     }
2583     if (bExp == 0) {
2584         normalizeFloat32Subnormal(bSig, &bExp, &bSig);
2585     }
2586 
2587     /* Calculate the actual result a * b + c */
2588 
2589     /* Multiply first; this is easy. */
2590     /* NB: we subtract 0x7e where float32_mul() subtracts 0x7f
2591      * because we want the true exponent, not the "one-less-than"
2592      * flavour that roundAndPackFloat32() takes.
2593      */
2594     pExp = aExp + bExp - 0x7e;
2595     aSig = (aSig | 0x00800000) << 7;
2596     bSig = (bSig | 0x00800000) << 8;
2597     pSig64 = (uint64_t)aSig * bSig;
2598     if ((int64_t)(pSig64 << 1) >= 0) {
2599         pSig64 <<= 1;
2600         pExp--;
2601     }
2602 
2603     zSign = pSign ^ signflip;
2604 
2605     /* Now pSig64 is the significand of the multiply, with the explicit bit in
2606      * position 62.
2607      */
2608     if (cExp == 0) {
2609         if (!cSig) {
2610             /* Throw out the special case of c being an exact zero now */
2611             shift64RightJamming(pSig64, 32, &pSig64);
2612             pSig = pSig64;
2613             if (flags & float_muladd_halve_result) {
2614                 pExp--;
2615             }
2616             return roundAndPackFloat32(zSign, pExp - 1,
2617                                        pSig, status);
2618         }
2619         normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2620     }
2621 
2622     cSig64 = (uint64_t)cSig << (62 - 23);
2623     cSig64 |= LIT64(0x4000000000000000);
2624     expDiff = pExp - cExp;
2625 
2626     if (pSign == cSign) {
2627         /* Addition */
2628         if (expDiff > 0) {
2629             /* scale c to match p */
2630             shift64RightJamming(cSig64, expDiff, &cSig64);
2631             zExp = pExp;
2632         } else if (expDiff < 0) {
2633             /* scale p to match c */
2634             shift64RightJamming(pSig64, -expDiff, &pSig64);
2635             zExp = cExp;
2636         } else {
2637             /* no scaling needed */
2638             zExp = cExp;
2639         }
2640         /* Add significands and make sure explicit bit ends up in posn 62 */
2641         zSig64 = pSig64 + cSig64;
2642         if ((int64_t)zSig64 < 0) {
2643             shift64RightJamming(zSig64, 1, &zSig64);
2644         } else {
2645             zExp--;
2646         }
2647     } else {
2648         /* Subtraction */
2649         if (expDiff > 0) {
2650             shift64RightJamming(cSig64, expDiff, &cSig64);
2651             zSig64 = pSig64 - cSig64;
2652             zExp = pExp;
2653         } else if (expDiff < 0) {
2654             shift64RightJamming(pSig64, -expDiff, &pSig64);
2655             zSig64 = cSig64 - pSig64;
2656             zExp = cExp;
2657             zSign ^= 1;
2658         } else {
2659             zExp = pExp;
2660             if (cSig64 < pSig64) {
2661                 zSig64 = pSig64 - cSig64;
2662             } else if (pSig64 < cSig64) {
2663                 zSig64 = cSig64 - pSig64;
2664                 zSign ^= 1;
2665             } else {
2666                 /* Exact zero */
2667                 zSign = signflip;
2668                 if (status->float_rounding_mode == float_round_down) {
2669                     zSign ^= 1;
2670                 }
2671                 return packFloat32(zSign, 0, 0);
2672             }
2673         }
2674         --zExp;
2675         /* Normalize to put the explicit bit back into bit 62. */
2676         shiftcount = countLeadingZeros64(zSig64) - 1;
2677         zSig64 <<= shiftcount;
2678         zExp -= shiftcount;
2679     }
2680     if (flags & float_muladd_halve_result) {
2681         zExp--;
2682     }
2683 
2684     shift64RightJamming(zSig64, 32, &zSig64);
2685     return roundAndPackFloat32(zSign, zExp, zSig64, status);
2686 }
2687 
2688 
2689 /*----------------------------------------------------------------------------
2690 | Returns the square root of the single-precision floating-point value `a'.
2691 | The operation is performed according to the IEC/IEEE Standard for Binary
2692 | Floating-Point Arithmetic.
2693 *----------------------------------------------------------------------------*/
2694 
2695 float32 float32_sqrt(float32 a, float_status *status)
2696 {
2697     flag aSign;
2698     int aExp, zExp;
2699     uint32_t aSig, zSig;
2700     uint64_t rem, term;
2701     a = float32_squash_input_denormal(a, status);
2702 
2703     aSig = extractFloat32Frac( a );
2704     aExp = extractFloat32Exp( a );
2705     aSign = extractFloat32Sign( a );
2706     if ( aExp == 0xFF ) {
2707         if (aSig) {
2708             return propagateFloat32NaN(a, float32_zero, status);
2709         }
2710         if ( ! aSign ) return a;
2711         float_raise(float_flag_invalid, status);
2712         return float32_default_nan(status);
2713     }
2714     if ( aSign ) {
2715         if ( ( aExp | aSig ) == 0 ) return a;
2716         float_raise(float_flag_invalid, status);
2717         return float32_default_nan(status);
2718     }
2719     if ( aExp == 0 ) {
2720         if ( aSig == 0 ) return float32_zero;
2721         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2722     }
2723     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2724     aSig = ( aSig | 0x00800000 )<<8;
2725     zSig = estimateSqrt32( aExp, aSig ) + 2;
2726     if ( ( zSig & 0x7F ) <= 5 ) {
2727         if ( zSig < 2 ) {
2728             zSig = 0x7FFFFFFF;
2729             goto roundAndPack;
2730         }
2731         aSig >>= aExp & 1;
2732         term = ( (uint64_t) zSig ) * zSig;
2733         rem = ( ( (uint64_t) aSig )<<32 ) - term;
2734         while ( (int64_t) rem < 0 ) {
2735             --zSig;
2736             rem += ( ( (uint64_t) zSig )<<1 ) | 1;
2737         }
2738         zSig |= ( rem != 0 );
2739     }
2740     shift32RightJamming( zSig, 1, &zSig );
2741  roundAndPack:
2742     return roundAndPackFloat32(0, zExp, zSig, status);
2743 
2744 }
2745 
2746 /*----------------------------------------------------------------------------
2747 | Returns the binary exponential of the single-precision floating-point value
2748 | `a'. The operation is performed according to the IEC/IEEE Standard for
2749 | Binary Floating-Point Arithmetic.
2750 |
2751 | Uses the following identities:
2752 |
2753 | 1. -------------------------------------------------------------------------
2754 |      x    x*ln(2)
2755 |     2  = e
2756 |
2757 | 2. -------------------------------------------------------------------------
2758 |                      2     3     4     5           n
2759 |      x        x     x     x     x     x           x
2760 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2761 |               1!    2!    3!    4!    5!          n!
2762 *----------------------------------------------------------------------------*/
2763 
2764 static const float64 float32_exp2_coefficients[15] =
2765 {
2766     const_float64( 0x3ff0000000000000ll ), /*  1 */
2767     const_float64( 0x3fe0000000000000ll ), /*  2 */
2768     const_float64( 0x3fc5555555555555ll ), /*  3 */
2769     const_float64( 0x3fa5555555555555ll ), /*  4 */
2770     const_float64( 0x3f81111111111111ll ), /*  5 */
2771     const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
2772     const_float64( 0x3f2a01a01a01a01all ), /*  7 */
2773     const_float64( 0x3efa01a01a01a01all ), /*  8 */
2774     const_float64( 0x3ec71de3a556c734ll ), /*  9 */
2775     const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2776     const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2777     const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2778     const_float64( 0x3de6124613a86d09ll ), /* 13 */
2779     const_float64( 0x3da93974a8c07c9dll ), /* 14 */
2780     const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
2781 };
2782 
2783 float32 float32_exp2(float32 a, float_status *status)
2784 {
2785     flag aSign;
2786     int aExp;
2787     uint32_t aSig;
2788     float64 r, x, xn;
2789     int i;
2790     a = float32_squash_input_denormal(a, status);
2791 
2792     aSig = extractFloat32Frac( a );
2793     aExp = extractFloat32Exp( a );
2794     aSign = extractFloat32Sign( a );
2795 
2796     if ( aExp == 0xFF) {
2797         if (aSig) {
2798             return propagateFloat32NaN(a, float32_zero, status);
2799         }
2800         return (aSign) ? float32_zero : a;
2801     }
2802     if (aExp == 0) {
2803         if (aSig == 0) return float32_one;
2804     }
2805 
2806     float_raise(float_flag_inexact, status);
2807 
2808     /* ******************************* */
2809     /* using float64 for approximation */
2810     /* ******************************* */
2811     x = float32_to_float64(a, status);
2812     x = float64_mul(x, float64_ln2, status);
2813 
2814     xn = x;
2815     r = float64_one;
2816     for (i = 0 ; i < 15 ; i++) {
2817         float64 f;
2818 
2819         f = float64_mul(xn, float32_exp2_coefficients[i], status);
2820         r = float64_add(r, f, status);
2821 
2822         xn = float64_mul(xn, x, status);
2823     }
2824 
2825     return float64_to_float32(r, status);
2826 }
2827 
2828 /*----------------------------------------------------------------------------
2829 | Returns the binary log of the single-precision floating-point value `a'.
2830 | The operation is performed according to the IEC/IEEE Standard for Binary
2831 | Floating-Point Arithmetic.
2832 *----------------------------------------------------------------------------*/
2833 float32 float32_log2(float32 a, float_status *status)
2834 {
2835     flag aSign, zSign;
2836     int aExp;
2837     uint32_t aSig, zSig, i;
2838 
2839     a = float32_squash_input_denormal(a, status);
2840     aSig = extractFloat32Frac( a );
2841     aExp = extractFloat32Exp( a );
2842     aSign = extractFloat32Sign( a );
2843 
2844     if ( aExp == 0 ) {
2845         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2846         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2847     }
2848     if ( aSign ) {
2849         float_raise(float_flag_invalid, status);
2850         return float32_default_nan(status);
2851     }
2852     if ( aExp == 0xFF ) {
2853         if (aSig) {
2854             return propagateFloat32NaN(a, float32_zero, status);
2855         }
2856         return a;
2857     }
2858 
2859     aExp -= 0x7F;
2860     aSig |= 0x00800000;
2861     zSign = aExp < 0;
2862     zSig = aExp << 23;
2863 
2864     for (i = 1 << 22; i > 0; i >>= 1) {
2865         aSig = ( (uint64_t)aSig * aSig ) >> 23;
2866         if ( aSig & 0x01000000 ) {
2867             aSig >>= 1;
2868             zSig |= i;
2869         }
2870     }
2871 
2872     if ( zSign )
2873         zSig = -zSig;
2874 
2875     return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
2876 }
2877 
2878 /*----------------------------------------------------------------------------
2879 | Returns 1 if the single-precision floating-point value `a' is equal to
2880 | the corresponding value `b', and 0 otherwise.  The invalid exception is
2881 | raised if either operand is a NaN.  Otherwise, the comparison is performed
2882 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2883 *----------------------------------------------------------------------------*/
2884 
2885 int float32_eq(float32 a, float32 b, float_status *status)
2886 {
2887     uint32_t av, bv;
2888     a = float32_squash_input_denormal(a, status);
2889     b = float32_squash_input_denormal(b, status);
2890 
2891     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2892          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2893        ) {
2894         float_raise(float_flag_invalid, status);
2895         return 0;
2896     }
2897     av = float32_val(a);
2898     bv = float32_val(b);
2899     return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2900 }
2901 
2902 /*----------------------------------------------------------------------------
2903 | Returns 1 if the single-precision floating-point value `a' is less than
2904 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
2905 | exception is raised if either operand is a NaN.  The comparison is performed
2906 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2907 *----------------------------------------------------------------------------*/
2908 
2909 int float32_le(float32 a, float32 b, float_status *status)
2910 {
2911     flag aSign, bSign;
2912     uint32_t av, bv;
2913     a = float32_squash_input_denormal(a, status);
2914     b = float32_squash_input_denormal(b, status);
2915 
2916     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2917          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2918        ) {
2919         float_raise(float_flag_invalid, status);
2920         return 0;
2921     }
2922     aSign = extractFloat32Sign( a );
2923     bSign = extractFloat32Sign( b );
2924     av = float32_val(a);
2925     bv = float32_val(b);
2926     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2927     return ( av == bv ) || ( aSign ^ ( av < bv ) );
2928 
2929 }
2930 
2931 /*----------------------------------------------------------------------------
2932 | Returns 1 if the single-precision floating-point value `a' is less than
2933 | the corresponding value `b', and 0 otherwise.  The invalid exception is
2934 | raised if either operand is a NaN.  The comparison is performed according
2935 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2936 *----------------------------------------------------------------------------*/
2937 
2938 int float32_lt(float32 a, float32 b, float_status *status)
2939 {
2940     flag aSign, bSign;
2941     uint32_t av, bv;
2942     a = float32_squash_input_denormal(a, status);
2943     b = float32_squash_input_denormal(b, status);
2944 
2945     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2946          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2947        ) {
2948         float_raise(float_flag_invalid, status);
2949         return 0;
2950     }
2951     aSign = extractFloat32Sign( a );
2952     bSign = extractFloat32Sign( b );
2953     av = float32_val(a);
2954     bv = float32_val(b);
2955     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2956     return ( av != bv ) && ( aSign ^ ( av < bv ) );
2957 
2958 }
2959 
2960 /*----------------------------------------------------------------------------
2961 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2962 | be compared, and 0 otherwise.  The invalid exception is raised if either
2963 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
2964 | Standard for Binary Floating-Point Arithmetic.
2965 *----------------------------------------------------------------------------*/
2966 
2967 int float32_unordered(float32 a, float32 b, float_status *status)
2968 {
2969     a = float32_squash_input_denormal(a, status);
2970     b = float32_squash_input_denormal(b, status);
2971 
2972     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2973          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2974        ) {
2975         float_raise(float_flag_invalid, status);
2976         return 1;
2977     }
2978     return 0;
2979 }
2980 
2981 /*----------------------------------------------------------------------------
2982 | Returns 1 if the single-precision floating-point value `a' is equal to
2983 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2984 | exception.  The comparison is performed according to the IEC/IEEE Standard
2985 | for Binary Floating-Point Arithmetic.
2986 *----------------------------------------------------------------------------*/
2987 
2988 int float32_eq_quiet(float32 a, float32 b, float_status *status)
2989 {
2990     a = float32_squash_input_denormal(a, status);
2991     b = float32_squash_input_denormal(b, status);
2992 
2993     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2994          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2995        ) {
2996         if (float32_is_signaling_nan(a, status)
2997          || float32_is_signaling_nan(b, status)) {
2998             float_raise(float_flag_invalid, status);
2999         }
3000         return 0;
3001     }
3002     return ( float32_val(a) == float32_val(b) ) ||
3003             ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
3004 }
3005 
3006 /*----------------------------------------------------------------------------
3007 | Returns 1 if the single-precision floating-point value `a' is less than or
3008 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3009 | cause an exception.  Otherwise, the comparison is performed according to the
3010 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3011 *----------------------------------------------------------------------------*/
3012 
3013 int float32_le_quiet(float32 a, float32 b, float_status *status)
3014 {
3015     flag aSign, bSign;
3016     uint32_t av, bv;
3017     a = float32_squash_input_denormal(a, status);
3018     b = float32_squash_input_denormal(b, status);
3019 
3020     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3021          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3022        ) {
3023         if (float32_is_signaling_nan(a, status)
3024          || float32_is_signaling_nan(b, status)) {
3025             float_raise(float_flag_invalid, status);
3026         }
3027         return 0;
3028     }
3029     aSign = extractFloat32Sign( a );
3030     bSign = extractFloat32Sign( b );
3031     av = float32_val(a);
3032     bv = float32_val(b);
3033     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3034     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3035 
3036 }
3037 
3038 /*----------------------------------------------------------------------------
3039 | Returns 1 if the single-precision floating-point value `a' is less than
3040 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3041 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3042 | Standard for Binary Floating-Point Arithmetic.
3043 *----------------------------------------------------------------------------*/
3044 
3045 int float32_lt_quiet(float32 a, float32 b, float_status *status)
3046 {
3047     flag aSign, bSign;
3048     uint32_t av, bv;
3049     a = float32_squash_input_denormal(a, status);
3050     b = float32_squash_input_denormal(b, status);
3051 
3052     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3053          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3054        ) {
3055         if (float32_is_signaling_nan(a, status)
3056          || float32_is_signaling_nan(b, status)) {
3057             float_raise(float_flag_invalid, status);
3058         }
3059         return 0;
3060     }
3061     aSign = extractFloat32Sign( a );
3062     bSign = extractFloat32Sign( b );
3063     av = float32_val(a);
3064     bv = float32_val(b);
3065     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3066     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3067 
3068 }
3069 
3070 /*----------------------------------------------------------------------------
3071 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3072 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
3073 | comparison is performed according to the IEC/IEEE Standard for Binary
3074 | Floating-Point Arithmetic.
3075 *----------------------------------------------------------------------------*/
3076 
3077 int float32_unordered_quiet(float32 a, float32 b, float_status *status)
3078 {
3079     a = float32_squash_input_denormal(a, status);
3080     b = float32_squash_input_denormal(b, status);
3081 
3082     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3083          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3084        ) {
3085         if (float32_is_signaling_nan(a, status)
3086          || float32_is_signaling_nan(b, status)) {
3087             float_raise(float_flag_invalid, status);
3088         }
3089         return 1;
3090     }
3091     return 0;
3092 }
3093 
3094 /*----------------------------------------------------------------------------
3095 | Returns the result of converting the double-precision floating-point value
3096 | `a' to the 32-bit two's complement integer format.  The conversion is
3097 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3098 | Arithmetic---which means in particular that the conversion is rounded
3099 | according to the current rounding mode.  If `a' is a NaN, the largest
3100 | positive integer is returned.  Otherwise, if the conversion overflows, the
3101 | largest integer with the same sign as `a' is returned.
3102 *----------------------------------------------------------------------------*/
3103 
3104 int32_t float64_to_int32(float64 a, float_status *status)
3105 {
3106     flag aSign;
3107     int aExp;
3108     int shiftCount;
3109     uint64_t aSig;
3110     a = float64_squash_input_denormal(a, status);
3111 
3112     aSig = extractFloat64Frac( a );
3113     aExp = extractFloat64Exp( a );
3114     aSign = extractFloat64Sign( a );
3115     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
3116     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3117     shiftCount = 0x42C - aExp;
3118     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
3119     return roundAndPackInt32(aSign, aSig, status);
3120 
3121 }
3122 
3123 /*----------------------------------------------------------------------------
3124 | Returns the result of converting the double-precision floating-point value
3125 | `a' to the 32-bit two's complement integer format.  The conversion is
3126 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3127 | Arithmetic, except that the conversion is always rounded toward zero.
3128 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
3129 | the conversion overflows, the largest integer with the same sign as `a' is
3130 | returned.
3131 *----------------------------------------------------------------------------*/
3132 
3133 int32_t float64_to_int32_round_to_zero(float64 a, float_status *status)
3134 {
3135     flag aSign;
3136     int aExp;
3137     int shiftCount;
3138     uint64_t aSig, savedASig;
3139     int32_t z;
3140     a = float64_squash_input_denormal(a, status);
3141 
3142     aSig = extractFloat64Frac( a );
3143     aExp = extractFloat64Exp( a );
3144     aSign = extractFloat64Sign( a );
3145     if ( 0x41E < aExp ) {
3146         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
3147         goto invalid;
3148     }
3149     else if ( aExp < 0x3FF ) {
3150         if (aExp || aSig) {
3151             status->float_exception_flags |= float_flag_inexact;
3152         }
3153         return 0;
3154     }
3155     aSig |= LIT64( 0x0010000000000000 );
3156     shiftCount = 0x433 - aExp;
3157     savedASig = aSig;
3158     aSig >>= shiftCount;
3159     z = aSig;
3160     if ( aSign ) z = - z;
3161     if ( ( z < 0 ) ^ aSign ) {
3162  invalid:
3163         float_raise(float_flag_invalid, status);
3164         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
3165     }
3166     if ( ( aSig<<shiftCount ) != savedASig ) {
3167         status->float_exception_flags |= float_flag_inexact;
3168     }
3169     return z;
3170 
3171 }
3172 
3173 /*----------------------------------------------------------------------------
3174 | Returns the result of converting the double-precision floating-point value
3175 | `a' to the 16-bit two's complement integer format.  The conversion is
3176 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3177 | Arithmetic, except that the conversion is always rounded toward zero.
3178 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
3179 | the conversion overflows, the largest integer with the same sign as `a' is
3180 | returned.
3181 *----------------------------------------------------------------------------*/
3182 
3183 int16_t float64_to_int16_round_to_zero(float64 a, float_status *status)
3184 {
3185     flag aSign;
3186     int aExp;
3187     int shiftCount;
3188     uint64_t aSig, savedASig;
3189     int32_t z;
3190 
3191     aSig = extractFloat64Frac( a );
3192     aExp = extractFloat64Exp( a );
3193     aSign = extractFloat64Sign( a );
3194     if ( 0x40E < aExp ) {
3195         if ( ( aExp == 0x7FF ) && aSig ) {
3196             aSign = 0;
3197         }
3198         goto invalid;
3199     }
3200     else if ( aExp < 0x3FF ) {
3201         if ( aExp || aSig ) {
3202             status->float_exception_flags |= float_flag_inexact;
3203         }
3204         return 0;
3205     }
3206     aSig |= LIT64( 0x0010000000000000 );
3207     shiftCount = 0x433 - aExp;
3208     savedASig = aSig;
3209     aSig >>= shiftCount;
3210     z = aSig;
3211     if ( aSign ) {
3212         z = - z;
3213     }
3214     if ( ( (int16_t)z < 0 ) ^ aSign ) {
3215  invalid:
3216         float_raise(float_flag_invalid, status);
3217         return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
3218     }
3219     if ( ( aSig<<shiftCount ) != savedASig ) {
3220         status->float_exception_flags |= float_flag_inexact;
3221     }
3222     return z;
3223 }
3224 
3225 /*----------------------------------------------------------------------------
3226 | Returns the result of converting the double-precision floating-point value
3227 | `a' to the 64-bit two's complement integer format.  The conversion is
3228 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3229 | Arithmetic---which means in particular that the conversion is rounded
3230 | according to the current rounding mode.  If `a' is a NaN, the largest
3231 | positive integer is returned.  Otherwise, if the conversion overflows, the
3232 | largest integer with the same sign as `a' is returned.
3233 *----------------------------------------------------------------------------*/
3234 
3235 int64_t float64_to_int64(float64 a, float_status *status)
3236 {
3237     flag aSign;
3238     int aExp;
3239     int shiftCount;
3240     uint64_t aSig, aSigExtra;
3241     a = float64_squash_input_denormal(a, status);
3242 
3243     aSig = extractFloat64Frac( a );
3244     aExp = extractFloat64Exp( a );
3245     aSign = extractFloat64Sign( a );
3246     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3247     shiftCount = 0x433 - aExp;
3248     if ( shiftCount <= 0 ) {
3249         if ( 0x43E < aExp ) {
3250             float_raise(float_flag_invalid, status);
3251             if (    ! aSign
3252                  || (    ( aExp == 0x7FF )
3253                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
3254                ) {
3255                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3256             }
3257             return (int64_t) LIT64( 0x8000000000000000 );
3258         }
3259         aSigExtra = 0;
3260         aSig <<= - shiftCount;
3261     }
3262     else {
3263         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3264     }
3265     return roundAndPackInt64(aSign, aSig, aSigExtra, status);
3266 
3267 }
3268 
3269 /*----------------------------------------------------------------------------
3270 | Returns the result of converting the double-precision floating-point value
3271 | `a' to the 64-bit two's complement integer format.  The conversion is
3272 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3273 | Arithmetic, except that the conversion is always rounded toward zero.
3274 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
3275 | the conversion overflows, the largest integer with the same sign as `a' is
3276 | returned.
3277 *----------------------------------------------------------------------------*/
3278 
3279 int64_t float64_to_int64_round_to_zero(float64 a, float_status *status)
3280 {
3281     flag aSign;
3282     int aExp;
3283     int shiftCount;
3284     uint64_t aSig;
3285     int64_t z;
3286     a = float64_squash_input_denormal(a, status);
3287 
3288     aSig = extractFloat64Frac( a );
3289     aExp = extractFloat64Exp( a );
3290     aSign = extractFloat64Sign( a );
3291     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3292     shiftCount = aExp - 0x433;
3293     if ( 0 <= shiftCount ) {
3294         if ( 0x43E <= aExp ) {
3295             if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
3296                 float_raise(float_flag_invalid, status);
3297                 if (    ! aSign
3298                      || (    ( aExp == 0x7FF )
3299                           && ( aSig != LIT64( 0x0010000000000000 ) ) )
3300                    ) {
3301                     return LIT64( 0x7FFFFFFFFFFFFFFF );
3302                 }
3303             }
3304             return (int64_t) LIT64( 0x8000000000000000 );
3305         }
3306         z = aSig<<shiftCount;
3307     }
3308     else {
3309         if ( aExp < 0x3FE ) {
3310             if (aExp | aSig) {
3311                 status->float_exception_flags |= float_flag_inexact;
3312             }
3313             return 0;
3314         }
3315         z = aSig>>( - shiftCount );
3316         if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
3317             status->float_exception_flags |= float_flag_inexact;
3318         }
3319     }
3320     if ( aSign ) z = - z;
3321     return z;
3322 
3323 }
3324 
3325 /*----------------------------------------------------------------------------
3326 | Returns the result of converting the double-precision floating-point value
3327 | `a' to the single-precision floating-point format.  The conversion is
3328 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3329 | Arithmetic.
3330 *----------------------------------------------------------------------------*/
3331 
3332 float32 float64_to_float32(float64 a, float_status *status)
3333 {
3334     flag aSign;
3335     int aExp;
3336     uint64_t aSig;
3337     uint32_t zSig;
3338     a = float64_squash_input_denormal(a, status);
3339 
3340     aSig = extractFloat64Frac( a );
3341     aExp = extractFloat64Exp( a );
3342     aSign = extractFloat64Sign( a );
3343     if ( aExp == 0x7FF ) {
3344         if (aSig) {
3345             return commonNaNToFloat32(float64ToCommonNaN(a, status), status);
3346         }
3347         return packFloat32( aSign, 0xFF, 0 );
3348     }
3349     shift64RightJamming( aSig, 22, &aSig );
3350     zSig = aSig;
3351     if ( aExp || zSig ) {
3352         zSig |= 0x40000000;
3353         aExp -= 0x381;
3354     }
3355     return roundAndPackFloat32(aSign, aExp, zSig, status);
3356 
3357 }
3358 
3359 
3360 /*----------------------------------------------------------------------------
3361 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3362 | half-precision floating-point value, returning the result.  After being
3363 | shifted into the proper positions, the three fields are simply added
3364 | together to form the result.  This means that any integer portion of `zSig'
3365 | will be added into the exponent.  Since a properly normalized significand
3366 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3367 | than the desired result exponent whenever `zSig' is a complete, normalized
3368 | significand.
3369 *----------------------------------------------------------------------------*/
3370 static float16 packFloat16(flag zSign, int zExp, uint16_t zSig)
3371 {
3372     return make_float16(
3373         (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
3374 }
3375 
3376 /*----------------------------------------------------------------------------
3377 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3378 | and significand `zSig', and returns the proper half-precision floating-
3379 | point value corresponding to the abstract input.  Ordinarily, the abstract
3380 | value is simply rounded and packed into the half-precision format, with
3381 | the inexact exception raised if the abstract input cannot be represented
3382 | exactly.  However, if the abstract value is too large, the overflow and
3383 | inexact exceptions are raised and an infinity or maximal finite value is
3384 | returned.  If the abstract value is too small, the input value is rounded to
3385 | a subnormal number, and the underflow and inexact exceptions are raised if
3386 | the abstract input cannot be represented exactly as a subnormal half-
3387 | precision floating-point number.
3388 | The `ieee' flag indicates whether to use IEEE standard half precision, or
3389 | ARM-style "alternative representation", which omits the NaN and Inf
3390 | encodings in order to raise the maximum representable exponent by one.
3391 |     The input significand `zSig' has its binary point between bits 22
3392 | and 23, which is 13 bits to the left of the usual location.  This shifted
3393 | significand must be normalized or smaller.  If `zSig' is not normalized,
3394 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3395 | and it must not require rounding.  In the usual case that `zSig' is
3396 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3397 | Note the slightly odd position of the binary point in zSig compared with the
3398 | other roundAndPackFloat functions. This should probably be fixed if we
3399 | need to implement more float16 routines than just conversion.
3400 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3401 | Binary Floating-Point Arithmetic.
3402 *----------------------------------------------------------------------------*/
3403 
3404 static float16 roundAndPackFloat16(flag zSign, int zExp,
3405                                    uint32_t zSig, flag ieee,
3406                                    float_status *status)
3407 {
3408     int maxexp = ieee ? 29 : 30;
3409     uint32_t mask;
3410     uint32_t increment;
3411     bool rounding_bumps_exp;
3412     bool is_tiny = false;
3413 
3414     /* Calculate the mask of bits of the mantissa which are not
3415      * representable in half-precision and will be lost.
3416      */
3417     if (zExp < 1) {
3418         /* Will be denormal in halfprec */
3419         mask = 0x00ffffff;
3420         if (zExp >= -11) {
3421             mask >>= 11 + zExp;
3422         }
3423     } else {
3424         /* Normal number in halfprec */
3425         mask = 0x00001fff;
3426     }
3427 
3428     switch (status->float_rounding_mode) {
3429     case float_round_nearest_even:
3430         increment = (mask + 1) >> 1;
3431         if ((zSig & mask) == increment) {
3432             increment = zSig & (increment << 1);
3433         }
3434         break;
3435     case float_round_ties_away:
3436         increment = (mask + 1) >> 1;
3437         break;
3438     case float_round_up:
3439         increment = zSign ? 0 : mask;
3440         break;
3441     case float_round_down:
3442         increment = zSign ? mask : 0;
3443         break;
3444     default: /* round_to_zero */
3445         increment = 0;
3446         break;
3447     }
3448 
3449     rounding_bumps_exp = (zSig + increment >= 0x01000000);
3450 
3451     if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) {
3452         if (ieee) {
3453             float_raise(float_flag_overflow | float_flag_inexact, status);
3454             return packFloat16(zSign, 0x1f, 0);
3455         } else {
3456             float_raise(float_flag_invalid, status);
3457             return packFloat16(zSign, 0x1f, 0x3ff);
3458         }
3459     }
3460 
3461     if (zExp < 0) {
3462         /* Note that flush-to-zero does not affect half-precision results */
3463         is_tiny =
3464             (status->float_detect_tininess == float_tininess_before_rounding)
3465             || (zExp < -1)
3466             || (!rounding_bumps_exp);
3467     }
3468     if (zSig & mask) {
3469         float_raise(float_flag_inexact, status);
3470         if (is_tiny) {
3471             float_raise(float_flag_underflow, status);
3472         }
3473     }
3474 
3475     zSig += increment;
3476     if (rounding_bumps_exp) {
3477         zSig >>= 1;
3478         zExp++;
3479     }
3480 
3481     if (zExp < -10) {
3482         return packFloat16(zSign, 0, 0);
3483     }
3484     if (zExp < 0) {
3485         zSig >>= -zExp;
3486         zExp = 0;
3487     }
3488     return packFloat16(zSign, zExp, zSig >> 13);
3489 }
3490 
3491 static void normalizeFloat16Subnormal(uint32_t aSig, int *zExpPtr,
3492                                       uint32_t *zSigPtr)
3493 {
3494     int8_t shiftCount = countLeadingZeros32(aSig) - 21;
3495     *zSigPtr = aSig << shiftCount;
3496     *zExpPtr = 1 - shiftCount;
3497 }
3498 
3499 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
3500    The latter gains extra exponent range by omitting the NaN/Inf encodings.  */
3501 
3502 float32 float16_to_float32(float16 a, flag ieee, float_status *status)
3503 {
3504     flag aSign;
3505     int aExp;
3506     uint32_t aSig;
3507 
3508     aSign = extractFloat16Sign(a);
3509     aExp = extractFloat16Exp(a);
3510     aSig = extractFloat16Frac(a);
3511 
3512     if (aExp == 0x1f && ieee) {
3513         if (aSig) {
3514             return commonNaNToFloat32(float16ToCommonNaN(a, status), status);
3515         }
3516         return packFloat32(aSign, 0xff, 0);
3517     }
3518     if (aExp == 0) {
3519         if (aSig == 0) {
3520             return packFloat32(aSign, 0, 0);
3521         }
3522 
3523         normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3524         aExp--;
3525     }
3526     return packFloat32( aSign, aExp + 0x70, aSig << 13);
3527 }
3528 
3529 float16 float32_to_float16(float32 a, flag ieee, float_status *status)
3530 {
3531     flag aSign;
3532     int aExp;
3533     uint32_t aSig;
3534 
3535     a = float32_squash_input_denormal(a, status);
3536 
3537     aSig = extractFloat32Frac( a );
3538     aExp = extractFloat32Exp( a );
3539     aSign = extractFloat32Sign( a );
3540     if ( aExp == 0xFF ) {
3541         if (aSig) {
3542             /* Input is a NaN */
3543             if (!ieee) {
3544                 float_raise(float_flag_invalid, status);
3545                 return packFloat16(aSign, 0, 0);
3546             }
3547             return commonNaNToFloat16(
3548                 float32ToCommonNaN(a, status), status);
3549         }
3550         /* Infinity */
3551         if (!ieee) {
3552             float_raise(float_flag_invalid, status);
3553             return packFloat16(aSign, 0x1f, 0x3ff);
3554         }
3555         return packFloat16(aSign, 0x1f, 0);
3556     }
3557     if (aExp == 0 && aSig == 0) {
3558         return packFloat16(aSign, 0, 0);
3559     }
3560     /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3561      * even if the input is denormal; however this is harmless because
3562      * the largest possible single-precision denormal is still smaller
3563      * than the smallest representable half-precision denormal, and so we
3564      * will end up ignoring aSig and returning via the "always return zero"
3565      * codepath.
3566      */
3567     aSig |= 0x00800000;
3568     aExp -= 0x71;
3569 
3570     return roundAndPackFloat16(aSign, aExp, aSig, ieee, status);
3571 }
3572 
3573 float64 float16_to_float64(float16 a, flag ieee, float_status *status)
3574 {
3575     flag aSign;
3576     int aExp;
3577     uint32_t aSig;
3578 
3579     aSign = extractFloat16Sign(a);
3580     aExp = extractFloat16Exp(a);
3581     aSig = extractFloat16Frac(a);
3582 
3583     if (aExp == 0x1f && ieee) {
3584         if (aSig) {
3585             return commonNaNToFloat64(
3586                 float16ToCommonNaN(a, status), status);
3587         }
3588         return packFloat64(aSign, 0x7ff, 0);
3589     }
3590     if (aExp == 0) {
3591         if (aSig == 0) {
3592             return packFloat64(aSign, 0, 0);
3593         }
3594 
3595         normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3596         aExp--;
3597     }
3598     return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42);
3599 }
3600 
3601 float16 float64_to_float16(float64 a, flag ieee, float_status *status)
3602 {
3603     flag aSign;
3604     int aExp;
3605     uint64_t aSig;
3606     uint32_t zSig;
3607 
3608     a = float64_squash_input_denormal(a, status);
3609 
3610     aSig = extractFloat64Frac(a);
3611     aExp = extractFloat64Exp(a);
3612     aSign = extractFloat64Sign(a);
3613     if (aExp == 0x7FF) {
3614         if (aSig) {
3615             /* Input is a NaN */
3616             if (!ieee) {
3617                 float_raise(float_flag_invalid, status);
3618                 return packFloat16(aSign, 0, 0);
3619             }
3620             return commonNaNToFloat16(
3621                 float64ToCommonNaN(a, status), status);
3622         }
3623         /* Infinity */
3624         if (!ieee) {
3625             float_raise(float_flag_invalid, status);
3626             return packFloat16(aSign, 0x1f, 0x3ff);
3627         }
3628         return packFloat16(aSign, 0x1f, 0);
3629     }
3630     shift64RightJamming(aSig, 29, &aSig);
3631     zSig = aSig;
3632     if (aExp == 0 && zSig == 0) {
3633         return packFloat16(aSign, 0, 0);
3634     }
3635     /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3636      * even if the input is denormal; however this is harmless because
3637      * the largest possible single-precision denormal is still smaller
3638      * than the smallest representable half-precision denormal, and so we
3639      * will end up ignoring aSig and returning via the "always return zero"
3640      * codepath.
3641      */
3642     zSig |= 0x00800000;
3643     aExp -= 0x3F1;
3644 
3645     return roundAndPackFloat16(aSign, aExp, zSig, ieee, status);
3646 }
3647 
3648 /*----------------------------------------------------------------------------
3649 | Returns the result of converting the double-precision floating-point value
3650 | `a' to the extended double-precision floating-point format.  The conversion
3651 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3652 | Arithmetic.
3653 *----------------------------------------------------------------------------*/
3654 
3655 floatx80 float64_to_floatx80(float64 a, float_status *status)
3656 {
3657     flag aSign;
3658     int aExp;
3659     uint64_t aSig;
3660 
3661     a = float64_squash_input_denormal(a, status);
3662     aSig = extractFloat64Frac( a );
3663     aExp = extractFloat64Exp( a );
3664     aSign = extractFloat64Sign( a );
3665     if ( aExp == 0x7FF ) {
3666         if (aSig) {
3667             return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
3668         }
3669         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3670     }
3671     if ( aExp == 0 ) {
3672         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3673         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3674     }
3675     return
3676         packFloatx80(
3677             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
3678 
3679 }
3680 
3681 /*----------------------------------------------------------------------------
3682 | Returns the result of converting the double-precision floating-point value
3683 | `a' to the quadruple-precision floating-point format.  The conversion is
3684 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3685 | Arithmetic.
3686 *----------------------------------------------------------------------------*/
3687 
3688 float128 float64_to_float128(float64 a, float_status *status)
3689 {
3690     flag aSign;
3691     int aExp;
3692     uint64_t aSig, zSig0, zSig1;
3693 
3694     a = float64_squash_input_denormal(a, status);
3695     aSig = extractFloat64Frac( a );
3696     aExp = extractFloat64Exp( a );
3697     aSign = extractFloat64Sign( a );
3698     if ( aExp == 0x7FF ) {
3699         if (aSig) {
3700             return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
3701         }
3702         return packFloat128( aSign, 0x7FFF, 0, 0 );
3703     }
3704     if ( aExp == 0 ) {
3705         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3706         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3707         --aExp;
3708     }
3709     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
3710     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
3711 
3712 }
3713 
3714 /*----------------------------------------------------------------------------
3715 | Rounds the double-precision floating-point value `a' to an integer, and
3716 | returns the result as a double-precision floating-point value.  The
3717 | operation is performed according to the IEC/IEEE Standard for Binary
3718 | Floating-Point Arithmetic.
3719 *----------------------------------------------------------------------------*/
3720 
3721 float64 float64_round_to_int(float64 a, float_status *status)
3722 {
3723     flag aSign;
3724     int aExp;
3725     uint64_t lastBitMask, roundBitsMask;
3726     uint64_t z;
3727     a = float64_squash_input_denormal(a, status);
3728 
3729     aExp = extractFloat64Exp( a );
3730     if ( 0x433 <= aExp ) {
3731         if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
3732             return propagateFloat64NaN(a, a, status);
3733         }
3734         return a;
3735     }
3736     if ( aExp < 0x3FF ) {
3737         if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a;
3738         status->float_exception_flags |= float_flag_inexact;
3739         aSign = extractFloat64Sign( a );
3740         switch (status->float_rounding_mode) {
3741          case float_round_nearest_even:
3742             if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
3743                 return packFloat64( aSign, 0x3FF, 0 );
3744             }
3745             break;
3746         case float_round_ties_away:
3747             if (aExp == 0x3FE) {
3748                 return packFloat64(aSign, 0x3ff, 0);
3749             }
3750             break;
3751          case float_round_down:
3752             return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
3753          case float_round_up:
3754             return make_float64(
3755             aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
3756         }
3757         return packFloat64( aSign, 0, 0 );
3758     }
3759     lastBitMask = 1;
3760     lastBitMask <<= 0x433 - aExp;
3761     roundBitsMask = lastBitMask - 1;
3762     z = float64_val(a);
3763     switch (status->float_rounding_mode) {
3764     case float_round_nearest_even:
3765         z += lastBitMask >> 1;
3766         if ((z & roundBitsMask) == 0) {
3767             z &= ~lastBitMask;
3768         }
3769         break;
3770     case float_round_ties_away:
3771         z += lastBitMask >> 1;
3772         break;
3773     case float_round_to_zero:
3774         break;
3775     case float_round_up:
3776         if (!extractFloat64Sign(make_float64(z))) {
3777             z += roundBitsMask;
3778         }
3779         break;
3780     case float_round_down:
3781         if (extractFloat64Sign(make_float64(z))) {
3782             z += roundBitsMask;
3783         }
3784         break;
3785     default:
3786         abort();
3787     }
3788     z &= ~ roundBitsMask;
3789     if (z != float64_val(a)) {
3790         status->float_exception_flags |= float_flag_inexact;
3791     }
3792     return make_float64(z);
3793 
3794 }
3795 
3796 float64 float64_trunc_to_int(float64 a, float_status *status)
3797 {
3798     int oldmode;
3799     float64 res;
3800     oldmode = status->float_rounding_mode;
3801     status->float_rounding_mode = float_round_to_zero;
3802     res = float64_round_to_int(a, status);
3803     status->float_rounding_mode = oldmode;
3804     return res;
3805 }
3806 
3807 /*----------------------------------------------------------------------------
3808 | Returns the result of adding the absolute values of the double-precision
3809 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
3810 | before being returned.  `zSign' is ignored if the result is a NaN.
3811 | The addition is performed according to the IEC/IEEE Standard for Binary
3812 | Floating-Point Arithmetic.
3813 *----------------------------------------------------------------------------*/
3814 
3815 static float64 addFloat64Sigs(float64 a, float64 b, flag zSign,
3816                               float_status *status)
3817 {
3818     int aExp, bExp, zExp;
3819     uint64_t aSig, bSig, zSig;
3820     int expDiff;
3821 
3822     aSig = extractFloat64Frac( a );
3823     aExp = extractFloat64Exp( a );
3824     bSig = extractFloat64Frac( b );
3825     bExp = extractFloat64Exp( b );
3826     expDiff = aExp - bExp;
3827     aSig <<= 9;
3828     bSig <<= 9;
3829     if ( 0 < expDiff ) {
3830         if ( aExp == 0x7FF ) {
3831             if (aSig) {
3832                 return propagateFloat64NaN(a, b, status);
3833             }
3834             return a;
3835         }
3836         if ( bExp == 0 ) {
3837             --expDiff;
3838         }
3839         else {
3840             bSig |= LIT64( 0x2000000000000000 );
3841         }
3842         shift64RightJamming( bSig, expDiff, &bSig );
3843         zExp = aExp;
3844     }
3845     else if ( expDiff < 0 ) {
3846         if ( bExp == 0x7FF ) {
3847             if (bSig) {
3848                 return propagateFloat64NaN(a, b, status);
3849             }
3850             return packFloat64( zSign, 0x7FF, 0 );
3851         }
3852         if ( aExp == 0 ) {
3853             ++expDiff;
3854         }
3855         else {
3856             aSig |= LIT64( 0x2000000000000000 );
3857         }
3858         shift64RightJamming( aSig, - expDiff, &aSig );
3859         zExp = bExp;
3860     }
3861     else {
3862         if ( aExp == 0x7FF ) {
3863             if (aSig | bSig) {
3864                 return propagateFloat64NaN(a, b, status);
3865             }
3866             return a;
3867         }
3868         if ( aExp == 0 ) {
3869             if (status->flush_to_zero) {
3870                 if (aSig | bSig) {
3871                     float_raise(float_flag_output_denormal, status);
3872                 }
3873                 return packFloat64(zSign, 0, 0);
3874             }
3875             return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
3876         }
3877         zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
3878         zExp = aExp;
3879         goto roundAndPack;
3880     }
3881     aSig |= LIT64( 0x2000000000000000 );
3882     zSig = ( aSig + bSig )<<1;
3883     --zExp;
3884     if ( (int64_t) zSig < 0 ) {
3885         zSig = aSig + bSig;
3886         ++zExp;
3887     }
3888  roundAndPack:
3889     return roundAndPackFloat64(zSign, zExp, zSig, status);
3890 
3891 }
3892 
3893 /*----------------------------------------------------------------------------
3894 | Returns the result of subtracting the absolute values of the double-
3895 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
3896 | difference is negated before being returned.  `zSign' is ignored if the
3897 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
3898 | Standard for Binary Floating-Point Arithmetic.
3899 *----------------------------------------------------------------------------*/
3900 
3901 static float64 subFloat64Sigs(float64 a, float64 b, flag zSign,
3902                               float_status *status)
3903 {
3904     int aExp, bExp, zExp;
3905     uint64_t aSig, bSig, zSig;
3906     int expDiff;
3907 
3908     aSig = extractFloat64Frac( a );
3909     aExp = extractFloat64Exp( a );
3910     bSig = extractFloat64Frac( b );
3911     bExp = extractFloat64Exp( b );
3912     expDiff = aExp - bExp;
3913     aSig <<= 10;
3914     bSig <<= 10;
3915     if ( 0 < expDiff ) goto aExpBigger;
3916     if ( expDiff < 0 ) goto bExpBigger;
3917     if ( aExp == 0x7FF ) {
3918         if (aSig | bSig) {
3919             return propagateFloat64NaN(a, b, status);
3920         }
3921         float_raise(float_flag_invalid, status);
3922         return float64_default_nan(status);
3923     }
3924     if ( aExp == 0 ) {
3925         aExp = 1;
3926         bExp = 1;
3927     }
3928     if ( bSig < aSig ) goto aBigger;
3929     if ( aSig < bSig ) goto bBigger;
3930     return packFloat64(status->float_rounding_mode == float_round_down, 0, 0);
3931  bExpBigger:
3932     if ( bExp == 0x7FF ) {
3933         if (bSig) {
3934             return propagateFloat64NaN(a, b, status);
3935         }
3936         return packFloat64( zSign ^ 1, 0x7FF, 0 );
3937     }
3938     if ( aExp == 0 ) {
3939         ++expDiff;
3940     }
3941     else {
3942         aSig |= LIT64( 0x4000000000000000 );
3943     }
3944     shift64RightJamming( aSig, - expDiff, &aSig );
3945     bSig |= LIT64( 0x4000000000000000 );
3946  bBigger:
3947     zSig = bSig - aSig;
3948     zExp = bExp;
3949     zSign ^= 1;
3950     goto normalizeRoundAndPack;
3951  aExpBigger:
3952     if ( aExp == 0x7FF ) {
3953         if (aSig) {
3954             return propagateFloat64NaN(a, b, status);
3955         }
3956         return a;
3957     }
3958     if ( bExp == 0 ) {
3959         --expDiff;
3960     }
3961     else {
3962         bSig |= LIT64( 0x4000000000000000 );
3963     }
3964     shift64RightJamming( bSig, expDiff, &bSig );
3965     aSig |= LIT64( 0x4000000000000000 );
3966  aBigger:
3967     zSig = aSig - bSig;
3968     zExp = aExp;
3969  normalizeRoundAndPack:
3970     --zExp;
3971     return normalizeRoundAndPackFloat64(zSign, zExp, zSig, status);
3972 
3973 }
3974 
3975 /*----------------------------------------------------------------------------
3976 | Returns the result of adding the double-precision floating-point values `a'
3977 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
3978 | Binary Floating-Point Arithmetic.
3979 *----------------------------------------------------------------------------*/
3980 
3981 float64 float64_add(float64 a, float64 b, float_status *status)
3982 {
3983     flag aSign, bSign;
3984     a = float64_squash_input_denormal(a, status);
3985     b = float64_squash_input_denormal(b, status);
3986 
3987     aSign = extractFloat64Sign( a );
3988     bSign = extractFloat64Sign( b );
3989     if ( aSign == bSign ) {
3990         return addFloat64Sigs(a, b, aSign, status);
3991     }
3992     else {
3993         return subFloat64Sigs(a, b, aSign, status);
3994     }
3995 
3996 }
3997 
3998 /*----------------------------------------------------------------------------
3999 | Returns the result of subtracting the double-precision floating-point values
4000 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
4001 | for Binary Floating-Point Arithmetic.
4002 *----------------------------------------------------------------------------*/
4003 
4004 float64 float64_sub(float64 a, float64 b, float_status *status)
4005 {
4006     flag aSign, bSign;
4007     a = float64_squash_input_denormal(a, status);
4008     b = float64_squash_input_denormal(b, status);
4009 
4010     aSign = extractFloat64Sign( a );
4011     bSign = extractFloat64Sign( b );
4012     if ( aSign == bSign ) {
4013         return subFloat64Sigs(a, b, aSign, status);
4014     }
4015     else {
4016         return addFloat64Sigs(a, b, aSign, status);
4017     }
4018 
4019 }
4020 
4021 /*----------------------------------------------------------------------------
4022 | Returns the result of multiplying the double-precision floating-point values
4023 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
4024 | for Binary Floating-Point Arithmetic.
4025 *----------------------------------------------------------------------------*/
4026 
4027 float64 float64_mul(float64 a, float64 b, float_status *status)
4028 {
4029     flag aSign, bSign, zSign;
4030     int aExp, bExp, zExp;
4031     uint64_t aSig, bSig, zSig0, zSig1;
4032 
4033     a = float64_squash_input_denormal(a, status);
4034     b = float64_squash_input_denormal(b, status);
4035 
4036     aSig = extractFloat64Frac( a );
4037     aExp = extractFloat64Exp( a );
4038     aSign = extractFloat64Sign( a );
4039     bSig = extractFloat64Frac( b );
4040     bExp = extractFloat64Exp( b );
4041     bSign = extractFloat64Sign( b );
4042     zSign = aSign ^ bSign;
4043     if ( aExp == 0x7FF ) {
4044         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
4045             return propagateFloat64NaN(a, b, status);
4046         }
4047         if ( ( bExp | bSig ) == 0 ) {
4048             float_raise(float_flag_invalid, status);
4049             return float64_default_nan(status);
4050         }
4051         return packFloat64( zSign, 0x7FF, 0 );
4052     }
4053     if ( bExp == 0x7FF ) {
4054         if (bSig) {
4055             return propagateFloat64NaN(a, b, status);
4056         }
4057         if ( ( aExp | aSig ) == 0 ) {
4058             float_raise(float_flag_invalid, status);
4059             return float64_default_nan(status);
4060         }
4061         return packFloat64( zSign, 0x7FF, 0 );
4062     }
4063     if ( aExp == 0 ) {
4064         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
4065         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4066     }
4067     if ( bExp == 0 ) {
4068         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
4069         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4070     }
4071     zExp = aExp + bExp - 0x3FF;
4072     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
4073     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4074     mul64To128( aSig, bSig, &zSig0, &zSig1 );
4075     zSig0 |= ( zSig1 != 0 );
4076     if ( 0 <= (int64_t) ( zSig0<<1 ) ) {
4077         zSig0 <<= 1;
4078         --zExp;
4079     }
4080     return roundAndPackFloat64(zSign, zExp, zSig0, status);
4081 
4082 }
4083 
4084 /*----------------------------------------------------------------------------
4085 | Returns the result of dividing the double-precision floating-point value `a'
4086 | by the corresponding value `b'.  The operation is performed according to
4087 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4088 *----------------------------------------------------------------------------*/
4089 
4090 float64 float64_div(float64 a, float64 b, float_status *status)
4091 {
4092     flag aSign, bSign, zSign;
4093     int aExp, bExp, zExp;
4094     uint64_t aSig, bSig, zSig;
4095     uint64_t rem0, rem1;
4096     uint64_t term0, term1;
4097     a = float64_squash_input_denormal(a, status);
4098     b = float64_squash_input_denormal(b, status);
4099 
4100     aSig = extractFloat64Frac( a );
4101     aExp = extractFloat64Exp( a );
4102     aSign = extractFloat64Sign( a );
4103     bSig = extractFloat64Frac( b );
4104     bExp = extractFloat64Exp( b );
4105     bSign = extractFloat64Sign( b );
4106     zSign = aSign ^ bSign;
4107     if ( aExp == 0x7FF ) {
4108         if (aSig) {
4109             return propagateFloat64NaN(a, b, status);
4110         }
4111         if ( bExp == 0x7FF ) {
4112             if (bSig) {
4113                 return propagateFloat64NaN(a, b, status);
4114             }
4115             float_raise(float_flag_invalid, status);
4116             return float64_default_nan(status);
4117         }
4118         return packFloat64( zSign, 0x7FF, 0 );
4119     }
4120     if ( bExp == 0x7FF ) {
4121         if (bSig) {
4122             return propagateFloat64NaN(a, b, status);
4123         }
4124         return packFloat64( zSign, 0, 0 );
4125     }
4126     if ( bExp == 0 ) {
4127         if ( bSig == 0 ) {
4128             if ( ( aExp | aSig ) == 0 ) {
4129                 float_raise(float_flag_invalid, status);
4130                 return float64_default_nan(status);
4131             }
4132             float_raise(float_flag_divbyzero, status);
4133             return packFloat64( zSign, 0x7FF, 0 );
4134         }
4135         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4136     }
4137     if ( aExp == 0 ) {
4138         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
4139         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4140     }
4141     zExp = aExp - bExp + 0x3FD;
4142     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
4143     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4144     if ( bSig <= ( aSig + aSig ) ) {
4145         aSig >>= 1;
4146         ++zExp;
4147     }
4148     zSig = estimateDiv128To64( aSig, 0, bSig );
4149     if ( ( zSig & 0x1FF ) <= 2 ) {
4150         mul64To128( bSig, zSig, &term0, &term1 );
4151         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
4152         while ( (int64_t) rem0 < 0 ) {
4153             --zSig;
4154             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4155         }
4156         zSig |= ( rem1 != 0 );
4157     }
4158     return roundAndPackFloat64(zSign, zExp, zSig, status);
4159 
4160 }
4161 
4162 /*----------------------------------------------------------------------------
4163 | Returns the remainder of the double-precision floating-point value `a'
4164 | with respect to the corresponding value `b'.  The operation is performed
4165 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4166 *----------------------------------------------------------------------------*/
4167 
4168 float64 float64_rem(float64 a, float64 b, float_status *status)
4169 {
4170     flag aSign, zSign;
4171     int aExp, bExp, expDiff;
4172     uint64_t aSig, bSig;
4173     uint64_t q, alternateASig;
4174     int64_t sigMean;
4175 
4176     a = float64_squash_input_denormal(a, status);
4177     b = float64_squash_input_denormal(b, status);
4178     aSig = extractFloat64Frac( a );
4179     aExp = extractFloat64Exp( a );
4180     aSign = extractFloat64Sign( a );
4181     bSig = extractFloat64Frac( b );
4182     bExp = extractFloat64Exp( b );
4183     if ( aExp == 0x7FF ) {
4184         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
4185             return propagateFloat64NaN(a, b, status);
4186         }
4187         float_raise(float_flag_invalid, status);
4188         return float64_default_nan(status);
4189     }
4190     if ( bExp == 0x7FF ) {
4191         if (bSig) {
4192             return propagateFloat64NaN(a, b, status);
4193         }
4194         return a;
4195     }
4196     if ( bExp == 0 ) {
4197         if ( bSig == 0 ) {
4198             float_raise(float_flag_invalid, status);
4199             return float64_default_nan(status);
4200         }
4201         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4202     }
4203     if ( aExp == 0 ) {
4204         if ( aSig == 0 ) return a;
4205         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4206     }
4207     expDiff = aExp - bExp;
4208     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
4209     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4210     if ( expDiff < 0 ) {
4211         if ( expDiff < -1 ) return a;
4212         aSig >>= 1;
4213     }
4214     q = ( bSig <= aSig );
4215     if ( q ) aSig -= bSig;
4216     expDiff -= 64;
4217     while ( 0 < expDiff ) {
4218         q = estimateDiv128To64( aSig, 0, bSig );
4219         q = ( 2 < q ) ? q - 2 : 0;
4220         aSig = - ( ( bSig>>2 ) * q );
4221         expDiff -= 62;
4222     }
4223     expDiff += 64;
4224     if ( 0 < expDiff ) {
4225         q = estimateDiv128To64( aSig, 0, bSig );
4226         q = ( 2 < q ) ? q - 2 : 0;
4227         q >>= 64 - expDiff;
4228         bSig >>= 2;
4229         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4230     }
4231     else {
4232         aSig >>= 2;
4233         bSig >>= 2;
4234     }
4235     do {
4236         alternateASig = aSig;
4237         ++q;
4238         aSig -= bSig;
4239     } while ( 0 <= (int64_t) aSig );
4240     sigMean = aSig + alternateASig;
4241     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4242         aSig = alternateASig;
4243     }
4244     zSign = ( (int64_t) aSig < 0 );
4245     if ( zSign ) aSig = - aSig;
4246     return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
4247 
4248 }
4249 
4250 /*----------------------------------------------------------------------------
4251 | Returns the result of multiplying the double-precision floating-point values
4252 | `a' and `b' then adding 'c', with no intermediate rounding step after the
4253 | multiplication.  The operation is performed according to the IEC/IEEE
4254 | Standard for Binary Floating-Point Arithmetic 754-2008.
4255 | The flags argument allows the caller to select negation of the
4256 | addend, the intermediate product, or the final result. (The difference
4257 | between this and having the caller do a separate negation is that negating
4258 | externally will flip the sign bit on NaNs.)
4259 *----------------------------------------------------------------------------*/
4260 
4261 float64 float64_muladd(float64 a, float64 b, float64 c, int flags,
4262                        float_status *status)
4263 {
4264     flag aSign, bSign, cSign, zSign;
4265     int aExp, bExp, cExp, pExp, zExp, expDiff;
4266     uint64_t aSig, bSig, cSig;
4267     flag pInf, pZero, pSign;
4268     uint64_t pSig0, pSig1, cSig0, cSig1, zSig0, zSig1;
4269     int shiftcount;
4270     flag signflip, infzero;
4271 
4272     a = float64_squash_input_denormal(a, status);
4273     b = float64_squash_input_denormal(b, status);
4274     c = float64_squash_input_denormal(c, status);
4275     aSig = extractFloat64Frac(a);
4276     aExp = extractFloat64Exp(a);
4277     aSign = extractFloat64Sign(a);
4278     bSig = extractFloat64Frac(b);
4279     bExp = extractFloat64Exp(b);
4280     bSign = extractFloat64Sign(b);
4281     cSig = extractFloat64Frac(c);
4282     cExp = extractFloat64Exp(c);
4283     cSign = extractFloat64Sign(c);
4284 
4285     infzero = ((aExp == 0 && aSig == 0 && bExp == 0x7ff && bSig == 0) ||
4286                (aExp == 0x7ff && aSig == 0 && bExp == 0 && bSig == 0));
4287 
4288     /* It is implementation-defined whether the cases of (0,inf,qnan)
4289      * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
4290      * they return if they do), so we have to hand this information
4291      * off to the target-specific pick-a-NaN routine.
4292      */
4293     if (((aExp == 0x7ff) && aSig) ||
4294         ((bExp == 0x7ff) && bSig) ||
4295         ((cExp == 0x7ff) && cSig)) {
4296         return propagateFloat64MulAddNaN(a, b, c, infzero, status);
4297     }
4298 
4299     if (infzero) {
4300         float_raise(float_flag_invalid, status);
4301         return float64_default_nan(status);
4302     }
4303 
4304     if (flags & float_muladd_negate_c) {
4305         cSign ^= 1;
4306     }
4307 
4308     signflip = (flags & float_muladd_negate_result) ? 1 : 0;
4309 
4310     /* Work out the sign and type of the product */
4311     pSign = aSign ^ bSign;
4312     if (flags & float_muladd_negate_product) {
4313         pSign ^= 1;
4314     }
4315     pInf = (aExp == 0x7ff) || (bExp == 0x7ff);
4316     pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
4317 
4318     if (cExp == 0x7ff) {
4319         if (pInf && (pSign ^ cSign)) {
4320             /* addition of opposite-signed infinities => InvalidOperation */
4321             float_raise(float_flag_invalid, status);
4322             return float64_default_nan(status);
4323         }
4324         /* Otherwise generate an infinity of the same sign */
4325         return packFloat64(cSign ^ signflip, 0x7ff, 0);
4326     }
4327 
4328     if (pInf) {
4329         return packFloat64(pSign ^ signflip, 0x7ff, 0);
4330     }
4331 
4332     if (pZero) {
4333         if (cExp == 0) {
4334             if (cSig == 0) {
4335                 /* Adding two exact zeroes */
4336                 if (pSign == cSign) {
4337                     zSign = pSign;
4338                 } else if (status->float_rounding_mode == float_round_down) {
4339                     zSign = 1;
4340                 } else {
4341                     zSign = 0;
4342                 }
4343                 return packFloat64(zSign ^ signflip, 0, 0);
4344             }
4345             /* Exact zero plus a denorm */
4346             if (status->flush_to_zero) {
4347                 float_raise(float_flag_output_denormal, status);
4348                 return packFloat64(cSign ^ signflip, 0, 0);
4349             }
4350         }
4351         /* Zero plus something non-zero : just return the something */
4352         if (flags & float_muladd_halve_result) {
4353             if (cExp == 0) {
4354                 normalizeFloat64Subnormal(cSig, &cExp, &cSig);
4355             }
4356             /* Subtract one to halve, and one again because roundAndPackFloat64
4357              * wants one less than the true exponent.
4358              */
4359             cExp -= 2;
4360             cSig = (cSig | 0x0010000000000000ULL) << 10;
4361             return roundAndPackFloat64(cSign ^ signflip, cExp, cSig, status);
4362         }
4363         return packFloat64(cSign ^ signflip, cExp, cSig);
4364     }
4365 
4366     if (aExp == 0) {
4367         normalizeFloat64Subnormal(aSig, &aExp, &aSig);
4368     }
4369     if (bExp == 0) {
4370         normalizeFloat64Subnormal(bSig, &bExp, &bSig);
4371     }
4372 
4373     /* Calculate the actual result a * b + c */
4374 
4375     /* Multiply first; this is easy. */
4376     /* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff
4377      * because we want the true exponent, not the "one-less-than"
4378      * flavour that roundAndPackFloat64() takes.
4379      */
4380     pExp = aExp + bExp - 0x3fe;
4381     aSig = (aSig | LIT64(0x0010000000000000))<<10;
4382     bSig = (bSig | LIT64(0x0010000000000000))<<11;
4383     mul64To128(aSig, bSig, &pSig0, &pSig1);
4384     if ((int64_t)(pSig0 << 1) >= 0) {
4385         shortShift128Left(pSig0, pSig1, 1, &pSig0, &pSig1);
4386         pExp--;
4387     }
4388 
4389     zSign = pSign ^ signflip;
4390 
4391     /* Now [pSig0:pSig1] is the significand of the multiply, with the explicit
4392      * bit in position 126.
4393      */
4394     if (cExp == 0) {
4395         if (!cSig) {
4396             /* Throw out the special case of c being an exact zero now */
4397             shift128RightJamming(pSig0, pSig1, 64, &pSig0, &pSig1);
4398             if (flags & float_muladd_halve_result) {
4399                 pExp--;
4400             }
4401             return roundAndPackFloat64(zSign, pExp - 1,
4402                                        pSig1, status);
4403         }
4404         normalizeFloat64Subnormal(cSig, &cExp, &cSig);
4405     }
4406 
4407     /* Shift cSig and add the explicit bit so [cSig0:cSig1] is the
4408      * significand of the addend, with the explicit bit in position 126.
4409      */
4410     cSig0 = cSig << (126 - 64 - 52);
4411     cSig1 = 0;
4412     cSig0 |= LIT64(0x4000000000000000);
4413     expDiff = pExp - cExp;
4414 
4415     if (pSign == cSign) {
4416         /* Addition */
4417         if (expDiff > 0) {
4418             /* scale c to match p */
4419             shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
4420             zExp = pExp;
4421         } else if (expDiff < 0) {
4422             /* scale p to match c */
4423             shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
4424             zExp = cExp;
4425         } else {
4426             /* no scaling needed */
4427             zExp = cExp;
4428         }
4429         /* Add significands and make sure explicit bit ends up in posn 126 */
4430         add128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4431         if ((int64_t)zSig0 < 0) {
4432             shift128RightJamming(zSig0, zSig1, 1, &zSig0, &zSig1);
4433         } else {
4434             zExp--;
4435         }
4436         shift128RightJamming(zSig0, zSig1, 64, &zSig0, &zSig1);
4437         if (flags & float_muladd_halve_result) {
4438             zExp--;
4439         }
4440         return roundAndPackFloat64(zSign, zExp, zSig1, status);
4441     } else {
4442         /* Subtraction */
4443         if (expDiff > 0) {
4444             shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
4445             sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4446             zExp = pExp;
4447         } else if (expDiff < 0) {
4448             shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
4449             sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
4450             zExp = cExp;
4451             zSign ^= 1;
4452         } else {
4453             zExp = pExp;
4454             if (lt128(cSig0, cSig1, pSig0, pSig1)) {
4455                 sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4456             } else if (lt128(pSig0, pSig1, cSig0, cSig1)) {
4457                 sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
4458                 zSign ^= 1;
4459             } else {
4460                 /* Exact zero */
4461                 zSign = signflip;
4462                 if (status->float_rounding_mode == float_round_down) {
4463                     zSign ^= 1;
4464                 }
4465                 return packFloat64(zSign, 0, 0);
4466             }
4467         }
4468         --zExp;
4469         /* Do the equivalent of normalizeRoundAndPackFloat64() but
4470          * starting with the significand in a pair of uint64_t.
4471          */
4472         if (zSig0) {
4473             shiftcount = countLeadingZeros64(zSig0) - 1;
4474             shortShift128Left(zSig0, zSig1, shiftcount, &zSig0, &zSig1);
4475             if (zSig1) {
4476                 zSig0 |= 1;
4477             }
4478             zExp -= shiftcount;
4479         } else {
4480             shiftcount = countLeadingZeros64(zSig1);
4481             if (shiftcount == 0) {
4482                 zSig0 = (zSig1 >> 1) | (zSig1 & 1);
4483                 zExp -= 63;
4484             } else {
4485                 shiftcount--;
4486                 zSig0 = zSig1 << shiftcount;
4487                 zExp -= (shiftcount + 64);
4488             }
4489         }
4490         if (flags & float_muladd_halve_result) {
4491             zExp--;
4492         }
4493         return roundAndPackFloat64(zSign, zExp, zSig0, status);
4494     }
4495 }
4496 
4497 /*----------------------------------------------------------------------------
4498 | Returns the square root of the double-precision floating-point value `a'.
4499 | The operation is performed according to the IEC/IEEE Standard for Binary
4500 | Floating-Point Arithmetic.
4501 *----------------------------------------------------------------------------*/
4502 
4503 float64 float64_sqrt(float64 a, float_status *status)
4504 {
4505     flag aSign;
4506     int aExp, zExp;
4507     uint64_t aSig, zSig, doubleZSig;
4508     uint64_t rem0, rem1, term0, term1;
4509     a = float64_squash_input_denormal(a, status);
4510 
4511     aSig = extractFloat64Frac( a );
4512     aExp = extractFloat64Exp( a );
4513     aSign = extractFloat64Sign( a );
4514     if ( aExp == 0x7FF ) {
4515         if (aSig) {
4516             return propagateFloat64NaN(a, a, status);
4517         }
4518         if ( ! aSign ) return a;
4519         float_raise(float_flag_invalid, status);
4520         return float64_default_nan(status);
4521     }
4522     if ( aSign ) {
4523         if ( ( aExp | aSig ) == 0 ) return a;
4524         float_raise(float_flag_invalid, status);
4525         return float64_default_nan(status);
4526     }
4527     if ( aExp == 0 ) {
4528         if ( aSig == 0 ) return float64_zero;
4529         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4530     }
4531     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
4532     aSig |= LIT64( 0x0010000000000000 );
4533     zSig = estimateSqrt32( aExp, aSig>>21 );
4534     aSig <<= 9 - ( aExp & 1 );
4535     zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
4536     if ( ( zSig & 0x1FF ) <= 5 ) {
4537         doubleZSig = zSig<<1;
4538         mul64To128( zSig, zSig, &term0, &term1 );
4539         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
4540         while ( (int64_t) rem0 < 0 ) {
4541             --zSig;
4542             doubleZSig -= 2;
4543             add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
4544         }
4545         zSig |= ( ( rem0 | rem1 ) != 0 );
4546     }
4547     return roundAndPackFloat64(0, zExp, zSig, status);
4548 
4549 }
4550 
4551 /*----------------------------------------------------------------------------
4552 | Returns the binary log of the double-precision floating-point value `a'.
4553 | The operation is performed according to the IEC/IEEE Standard for Binary
4554 | Floating-Point Arithmetic.
4555 *----------------------------------------------------------------------------*/
4556 float64 float64_log2(float64 a, float_status *status)
4557 {
4558     flag aSign, zSign;
4559     int aExp;
4560     uint64_t aSig, aSig0, aSig1, zSig, i;
4561     a = float64_squash_input_denormal(a, status);
4562 
4563     aSig = extractFloat64Frac( a );
4564     aExp = extractFloat64Exp( a );
4565     aSign = extractFloat64Sign( a );
4566 
4567     if ( aExp == 0 ) {
4568         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4569         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4570     }
4571     if ( aSign ) {
4572         float_raise(float_flag_invalid, status);
4573         return float64_default_nan(status);
4574     }
4575     if ( aExp == 0x7FF ) {
4576         if (aSig) {
4577             return propagateFloat64NaN(a, float64_zero, status);
4578         }
4579         return a;
4580     }
4581 
4582     aExp -= 0x3FF;
4583     aSig |= LIT64( 0x0010000000000000 );
4584     zSign = aExp < 0;
4585     zSig = (uint64_t)aExp << 52;
4586     for (i = 1LL << 51; i > 0; i >>= 1) {
4587         mul64To128( aSig, aSig, &aSig0, &aSig1 );
4588         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4589         if ( aSig & LIT64( 0x0020000000000000 ) ) {
4590             aSig >>= 1;
4591             zSig |= i;
4592         }
4593     }
4594 
4595     if ( zSign )
4596         zSig = -zSig;
4597     return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
4598 }
4599 
4600 /*----------------------------------------------------------------------------
4601 | Returns 1 if the double-precision floating-point value `a' is equal to the
4602 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
4603 | if either operand is a NaN.  Otherwise, the comparison is performed
4604 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4605 *----------------------------------------------------------------------------*/
4606 
4607 int float64_eq(float64 a, float64 b, float_status *status)
4608 {
4609     uint64_t av, bv;
4610     a = float64_squash_input_denormal(a, status);
4611     b = float64_squash_input_denormal(b, status);
4612 
4613     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4614          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4615        ) {
4616         float_raise(float_flag_invalid, status);
4617         return 0;
4618     }
4619     av = float64_val(a);
4620     bv = float64_val(b);
4621     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4622 
4623 }
4624 
4625 /*----------------------------------------------------------------------------
4626 | Returns 1 if the double-precision floating-point value `a' is less than or
4627 | equal to the corresponding value `b', and 0 otherwise.  The invalid
4628 | exception is raised if either operand is a NaN.  The comparison is performed
4629 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4630 *----------------------------------------------------------------------------*/
4631 
4632 int float64_le(float64 a, float64 b, float_status *status)
4633 {
4634     flag aSign, bSign;
4635     uint64_t av, bv;
4636     a = float64_squash_input_denormal(a, status);
4637     b = float64_squash_input_denormal(b, status);
4638 
4639     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4640          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4641        ) {
4642         float_raise(float_flag_invalid, status);
4643         return 0;
4644     }
4645     aSign = extractFloat64Sign( a );
4646     bSign = extractFloat64Sign( b );
4647     av = float64_val(a);
4648     bv = float64_val(b);
4649     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4650     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4651 
4652 }
4653 
4654 /*----------------------------------------------------------------------------
4655 | Returns 1 if the double-precision floating-point value `a' is less than
4656 | the corresponding value `b', and 0 otherwise.  The invalid exception is
4657 | raised if either operand is a NaN.  The comparison is performed according
4658 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4659 *----------------------------------------------------------------------------*/
4660 
4661 int float64_lt(float64 a, float64 b, float_status *status)
4662 {
4663     flag aSign, bSign;
4664     uint64_t av, bv;
4665 
4666     a = float64_squash_input_denormal(a, status);
4667     b = float64_squash_input_denormal(b, status);
4668     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4669          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4670        ) {
4671         float_raise(float_flag_invalid, status);
4672         return 0;
4673     }
4674     aSign = extractFloat64Sign( a );
4675     bSign = extractFloat64Sign( b );
4676     av = float64_val(a);
4677     bv = float64_val(b);
4678     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4679     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4680 
4681 }
4682 
4683 /*----------------------------------------------------------------------------
4684 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4685 | be compared, and 0 otherwise.  The invalid exception is raised if either
4686 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
4687 | Standard for Binary Floating-Point Arithmetic.
4688 *----------------------------------------------------------------------------*/
4689 
4690 int float64_unordered(float64 a, float64 b, float_status *status)
4691 {
4692     a = float64_squash_input_denormal(a, status);
4693     b = float64_squash_input_denormal(b, status);
4694 
4695     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4696          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4697        ) {
4698         float_raise(float_flag_invalid, status);
4699         return 1;
4700     }
4701     return 0;
4702 }
4703 
4704 /*----------------------------------------------------------------------------
4705 | Returns 1 if the double-precision floating-point value `a' is equal to the
4706 | corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4707 | exception.The comparison is performed according to the IEC/IEEE Standard
4708 | for Binary Floating-Point Arithmetic.
4709 *----------------------------------------------------------------------------*/
4710 
4711 int float64_eq_quiet(float64 a, float64 b, float_status *status)
4712 {
4713     uint64_t av, bv;
4714     a = float64_squash_input_denormal(a, status);
4715     b = float64_squash_input_denormal(b, status);
4716 
4717     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4718          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4719        ) {
4720         if (float64_is_signaling_nan(a, status)
4721          || float64_is_signaling_nan(b, status)) {
4722             float_raise(float_flag_invalid, status);
4723         }
4724         return 0;
4725     }
4726     av = float64_val(a);
4727     bv = float64_val(b);
4728     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4729 
4730 }
4731 
4732 /*----------------------------------------------------------------------------
4733 | Returns 1 if the double-precision floating-point value `a' is less than or
4734 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4735 | cause an exception.  Otherwise, the comparison is performed according to the
4736 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4737 *----------------------------------------------------------------------------*/
4738 
4739 int float64_le_quiet(float64 a, float64 b, float_status *status)
4740 {
4741     flag aSign, bSign;
4742     uint64_t av, bv;
4743     a = float64_squash_input_denormal(a, status);
4744     b = float64_squash_input_denormal(b, status);
4745 
4746     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4747          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4748        ) {
4749         if (float64_is_signaling_nan(a, status)
4750          || float64_is_signaling_nan(b, status)) {
4751             float_raise(float_flag_invalid, status);
4752         }
4753         return 0;
4754     }
4755     aSign = extractFloat64Sign( a );
4756     bSign = extractFloat64Sign( b );
4757     av = float64_val(a);
4758     bv = float64_val(b);
4759     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4760     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4761 
4762 }
4763 
4764 /*----------------------------------------------------------------------------
4765 | Returns 1 if the double-precision floating-point value `a' is less than
4766 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4767 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4768 | Standard for Binary Floating-Point Arithmetic.
4769 *----------------------------------------------------------------------------*/
4770 
4771 int float64_lt_quiet(float64 a, float64 b, float_status *status)
4772 {
4773     flag aSign, bSign;
4774     uint64_t av, bv;
4775     a = float64_squash_input_denormal(a, status);
4776     b = float64_squash_input_denormal(b, status);
4777 
4778     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4779          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4780        ) {
4781         if (float64_is_signaling_nan(a, status)
4782          || float64_is_signaling_nan(b, status)) {
4783             float_raise(float_flag_invalid, status);
4784         }
4785         return 0;
4786     }
4787     aSign = extractFloat64Sign( a );
4788     bSign = extractFloat64Sign( b );
4789     av = float64_val(a);
4790     bv = float64_val(b);
4791     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4792     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4793 
4794 }
4795 
4796 /*----------------------------------------------------------------------------
4797 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4798 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
4799 | comparison is performed according to the IEC/IEEE Standard for Binary
4800 | Floating-Point Arithmetic.
4801 *----------------------------------------------------------------------------*/
4802 
4803 int float64_unordered_quiet(float64 a, float64 b, float_status *status)
4804 {
4805     a = float64_squash_input_denormal(a, status);
4806     b = float64_squash_input_denormal(b, status);
4807 
4808     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4809          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4810        ) {
4811         if (float64_is_signaling_nan(a, status)
4812          || float64_is_signaling_nan(b, status)) {
4813             float_raise(float_flag_invalid, status);
4814         }
4815         return 1;
4816     }
4817     return 0;
4818 }
4819 
4820 /*----------------------------------------------------------------------------
4821 | Returns the result of converting the extended double-precision floating-
4822 | point value `a' to the 32-bit two's complement integer format.  The
4823 | conversion is performed according to the IEC/IEEE Standard for Binary
4824 | Floating-Point Arithmetic---which means in particular that the conversion
4825 | is rounded according to the current rounding mode.  If `a' is a NaN, the
4826 | largest positive integer is returned.  Otherwise, if the conversion
4827 | overflows, the largest integer with the same sign as `a' is returned.
4828 *----------------------------------------------------------------------------*/
4829 
4830 int32_t floatx80_to_int32(floatx80 a, float_status *status)
4831 {
4832     flag aSign;
4833     int32_t aExp, shiftCount;
4834     uint64_t aSig;
4835 
4836     if (floatx80_invalid_encoding(a)) {
4837         float_raise(float_flag_invalid, status);
4838         return 1 << 31;
4839     }
4840     aSig = extractFloatx80Frac( a );
4841     aExp = extractFloatx80Exp( a );
4842     aSign = extractFloatx80Sign( a );
4843     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4844     shiftCount = 0x4037 - aExp;
4845     if ( shiftCount <= 0 ) shiftCount = 1;
4846     shift64RightJamming( aSig, shiftCount, &aSig );
4847     return roundAndPackInt32(aSign, aSig, status);
4848 
4849 }
4850 
4851 /*----------------------------------------------------------------------------
4852 | Returns the result of converting the extended double-precision floating-
4853 | point value `a' to the 32-bit two's complement integer format.  The
4854 | conversion is performed according to the IEC/IEEE Standard for Binary
4855 | Floating-Point Arithmetic, except that the conversion is always rounded
4856 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4857 | Otherwise, if the conversion overflows, the largest integer with the same
4858 | sign as `a' is returned.
4859 *----------------------------------------------------------------------------*/
4860 
4861 int32_t floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
4862 {
4863     flag aSign;
4864     int32_t aExp, shiftCount;
4865     uint64_t aSig, savedASig;
4866     int32_t z;
4867 
4868     if (floatx80_invalid_encoding(a)) {
4869         float_raise(float_flag_invalid, status);
4870         return 1 << 31;
4871     }
4872     aSig = extractFloatx80Frac( a );
4873     aExp = extractFloatx80Exp( a );
4874     aSign = extractFloatx80Sign( a );
4875     if ( 0x401E < aExp ) {
4876         if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4877         goto invalid;
4878     }
4879     else if ( aExp < 0x3FFF ) {
4880         if (aExp || aSig) {
4881             status->float_exception_flags |= float_flag_inexact;
4882         }
4883         return 0;
4884     }
4885     shiftCount = 0x403E - aExp;
4886     savedASig = aSig;
4887     aSig >>= shiftCount;
4888     z = aSig;
4889     if ( aSign ) z = - z;
4890     if ( ( z < 0 ) ^ aSign ) {
4891  invalid:
4892         float_raise(float_flag_invalid, status);
4893         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4894     }
4895     if ( ( aSig<<shiftCount ) != savedASig ) {
4896         status->float_exception_flags |= float_flag_inexact;
4897     }
4898     return z;
4899 
4900 }
4901 
4902 /*----------------------------------------------------------------------------
4903 | Returns the result of converting the extended double-precision floating-
4904 | point value `a' to the 64-bit two's complement integer format.  The
4905 | conversion is performed according to the IEC/IEEE Standard for Binary
4906 | Floating-Point Arithmetic---which means in particular that the conversion
4907 | is rounded according to the current rounding mode.  If `a' is a NaN,
4908 | the largest positive integer is returned.  Otherwise, if the conversion
4909 | overflows, the largest integer with the same sign as `a' is returned.
4910 *----------------------------------------------------------------------------*/
4911 
4912 int64_t floatx80_to_int64(floatx80 a, float_status *status)
4913 {
4914     flag aSign;
4915     int32_t aExp, shiftCount;
4916     uint64_t aSig, aSigExtra;
4917 
4918     if (floatx80_invalid_encoding(a)) {
4919         float_raise(float_flag_invalid, status);
4920         return 1ULL << 63;
4921     }
4922     aSig = extractFloatx80Frac( a );
4923     aExp = extractFloatx80Exp( a );
4924     aSign = extractFloatx80Sign( a );
4925     shiftCount = 0x403E - aExp;
4926     if ( shiftCount <= 0 ) {
4927         if ( shiftCount ) {
4928             float_raise(float_flag_invalid, status);
4929             if (    ! aSign
4930                  || (    ( aExp == 0x7FFF )
4931                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
4932                ) {
4933                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4934             }
4935             return (int64_t) LIT64( 0x8000000000000000 );
4936         }
4937         aSigExtra = 0;
4938     }
4939     else {
4940         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4941     }
4942     return roundAndPackInt64(aSign, aSig, aSigExtra, status);
4943 
4944 }
4945 
4946 /*----------------------------------------------------------------------------
4947 | Returns the result of converting the extended double-precision floating-
4948 | point value `a' to the 64-bit two's complement integer format.  The
4949 | conversion is performed according to the IEC/IEEE Standard for Binary
4950 | Floating-Point Arithmetic, except that the conversion is always rounded
4951 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4952 | Otherwise, if the conversion overflows, the largest integer with the same
4953 | sign as `a' is returned.
4954 *----------------------------------------------------------------------------*/
4955 
4956 int64_t floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
4957 {
4958     flag aSign;
4959     int32_t aExp, shiftCount;
4960     uint64_t aSig;
4961     int64_t z;
4962 
4963     if (floatx80_invalid_encoding(a)) {
4964         float_raise(float_flag_invalid, status);
4965         return 1ULL << 63;
4966     }
4967     aSig = extractFloatx80Frac( a );
4968     aExp = extractFloatx80Exp( a );
4969     aSign = extractFloatx80Sign( a );
4970     shiftCount = aExp - 0x403E;
4971     if ( 0 <= shiftCount ) {
4972         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4973         if ( ( a.high != 0xC03E ) || aSig ) {
4974             float_raise(float_flag_invalid, status);
4975             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4976                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4977             }
4978         }
4979         return (int64_t) LIT64( 0x8000000000000000 );
4980     }
4981     else if ( aExp < 0x3FFF ) {
4982         if (aExp | aSig) {
4983             status->float_exception_flags |= float_flag_inexact;
4984         }
4985         return 0;
4986     }
4987     z = aSig>>( - shiftCount );
4988     if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
4989         status->float_exception_flags |= float_flag_inexact;
4990     }
4991     if ( aSign ) z = - z;
4992     return z;
4993 
4994 }
4995 
4996 /*----------------------------------------------------------------------------
4997 | Returns the result of converting the extended double-precision floating-
4998 | point value `a' to the single-precision floating-point format.  The
4999 | conversion is performed according to the IEC/IEEE Standard for Binary
5000 | Floating-Point Arithmetic.
5001 *----------------------------------------------------------------------------*/
5002 
5003 float32 floatx80_to_float32(floatx80 a, float_status *status)
5004 {
5005     flag aSign;
5006     int32_t aExp;
5007     uint64_t aSig;
5008 
5009     if (floatx80_invalid_encoding(a)) {
5010         float_raise(float_flag_invalid, status);
5011         return float32_default_nan(status);
5012     }
5013     aSig = extractFloatx80Frac( a );
5014     aExp = extractFloatx80Exp( a );
5015     aSign = extractFloatx80Sign( a );
5016     if ( aExp == 0x7FFF ) {
5017         if ( (uint64_t) ( aSig<<1 ) ) {
5018             return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
5019         }
5020         return packFloat32( aSign, 0xFF, 0 );
5021     }
5022     shift64RightJamming( aSig, 33, &aSig );
5023     if ( aExp || aSig ) aExp -= 0x3F81;
5024     return roundAndPackFloat32(aSign, aExp, aSig, status);
5025 
5026 }
5027 
5028 /*----------------------------------------------------------------------------
5029 | Returns the result of converting the extended double-precision floating-
5030 | point value `a' to the double-precision floating-point format.  The
5031 | conversion is performed according to the IEC/IEEE Standard for Binary
5032 | Floating-Point Arithmetic.
5033 *----------------------------------------------------------------------------*/
5034 
5035 float64 floatx80_to_float64(floatx80 a, float_status *status)
5036 {
5037     flag aSign;
5038     int32_t aExp;
5039     uint64_t aSig, zSig;
5040 
5041     if (floatx80_invalid_encoding(a)) {
5042         float_raise(float_flag_invalid, status);
5043         return float64_default_nan(status);
5044     }
5045     aSig = extractFloatx80Frac( a );
5046     aExp = extractFloatx80Exp( a );
5047     aSign = extractFloatx80Sign( a );
5048     if ( aExp == 0x7FFF ) {
5049         if ( (uint64_t) ( aSig<<1 ) ) {
5050             return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
5051         }
5052         return packFloat64( aSign, 0x7FF, 0 );
5053     }
5054     shift64RightJamming( aSig, 1, &zSig );
5055     if ( aExp || aSig ) aExp -= 0x3C01;
5056     return roundAndPackFloat64(aSign, aExp, zSig, status);
5057 
5058 }
5059 
5060 /*----------------------------------------------------------------------------
5061 | Returns the result of converting the extended double-precision floating-
5062 | point value `a' to the quadruple-precision floating-point format.  The
5063 | conversion is performed according to the IEC/IEEE Standard for Binary
5064 | Floating-Point Arithmetic.
5065 *----------------------------------------------------------------------------*/
5066 
5067 float128 floatx80_to_float128(floatx80 a, float_status *status)
5068 {
5069     flag aSign;
5070     int aExp;
5071     uint64_t aSig, zSig0, zSig1;
5072 
5073     if (floatx80_invalid_encoding(a)) {
5074         float_raise(float_flag_invalid, status);
5075         return float128_default_nan(status);
5076     }
5077     aSig = extractFloatx80Frac( a );
5078     aExp = extractFloatx80Exp( a );
5079     aSign = extractFloatx80Sign( a );
5080     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
5081         return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
5082     }
5083     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
5084     return packFloat128( aSign, aExp, zSig0, zSig1 );
5085 
5086 }
5087 
5088 /*----------------------------------------------------------------------------
5089 | Rounds the extended double-precision floating-point value `a'
5090 | to the precision provided by floatx80_rounding_precision and returns the
5091 | result as an extended double-precision floating-point value.
5092 | The operation is performed according to the IEC/IEEE Standard for Binary
5093 | Floating-Point Arithmetic.
5094 *----------------------------------------------------------------------------*/
5095 
5096 floatx80 floatx80_round(floatx80 a, float_status *status)
5097 {
5098     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5099                                 extractFloatx80Sign(a),
5100                                 extractFloatx80Exp(a),
5101                                 extractFloatx80Frac(a), 0, status);
5102 }
5103 
5104 /*----------------------------------------------------------------------------
5105 | Rounds the extended double-precision floating-point value `a' to an integer,
5106 | and returns the result as an extended quadruple-precision floating-point
5107 | value.  The operation is performed according to the IEC/IEEE Standard for
5108 | Binary Floating-Point Arithmetic.
5109 *----------------------------------------------------------------------------*/
5110 
5111 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
5112 {
5113     flag aSign;
5114     int32_t aExp;
5115     uint64_t lastBitMask, roundBitsMask;
5116     floatx80 z;
5117 
5118     if (floatx80_invalid_encoding(a)) {
5119         float_raise(float_flag_invalid, status);
5120         return floatx80_default_nan(status);
5121     }
5122     aExp = extractFloatx80Exp( a );
5123     if ( 0x403E <= aExp ) {
5124         if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
5125             return propagateFloatx80NaN(a, a, status);
5126         }
5127         return a;
5128     }
5129     if ( aExp < 0x3FFF ) {
5130         if (    ( aExp == 0 )
5131              && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
5132             return a;
5133         }
5134         status->float_exception_flags |= float_flag_inexact;
5135         aSign = extractFloatx80Sign( a );
5136         switch (status->float_rounding_mode) {
5137          case float_round_nearest_even:
5138             if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
5139                ) {
5140                 return
5141                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
5142             }
5143             break;
5144         case float_round_ties_away:
5145             if (aExp == 0x3FFE) {
5146                 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
5147             }
5148             break;
5149          case float_round_down:
5150             return
5151                   aSign ?
5152                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
5153                 : packFloatx80( 0, 0, 0 );
5154          case float_round_up:
5155             return
5156                   aSign ? packFloatx80( 1, 0, 0 )
5157                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
5158         }
5159         return packFloatx80( aSign, 0, 0 );
5160     }
5161     lastBitMask = 1;
5162     lastBitMask <<= 0x403E - aExp;
5163     roundBitsMask = lastBitMask - 1;
5164     z = a;
5165     switch (status->float_rounding_mode) {
5166     case float_round_nearest_even:
5167         z.low += lastBitMask>>1;
5168         if ((z.low & roundBitsMask) == 0) {
5169             z.low &= ~lastBitMask;
5170         }
5171         break;
5172     case float_round_ties_away:
5173         z.low += lastBitMask >> 1;
5174         break;
5175     case float_round_to_zero:
5176         break;
5177     case float_round_up:
5178         if (!extractFloatx80Sign(z)) {
5179             z.low += roundBitsMask;
5180         }
5181         break;
5182     case float_round_down:
5183         if (extractFloatx80Sign(z)) {
5184             z.low += roundBitsMask;
5185         }
5186         break;
5187     default:
5188         abort();
5189     }
5190     z.low &= ~ roundBitsMask;
5191     if ( z.low == 0 ) {
5192         ++z.high;
5193         z.low = LIT64( 0x8000000000000000 );
5194     }
5195     if (z.low != a.low) {
5196         status->float_exception_flags |= float_flag_inexact;
5197     }
5198     return z;
5199 
5200 }
5201 
5202 /*----------------------------------------------------------------------------
5203 | Returns the result of adding the absolute values of the extended double-
5204 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
5205 | negated before being returned.  `zSign' is ignored if the result is a NaN.
5206 | The addition is performed according to the IEC/IEEE Standard for Binary
5207 | Floating-Point Arithmetic.
5208 *----------------------------------------------------------------------------*/
5209 
5210 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5211                                 float_status *status)
5212 {
5213     int32_t aExp, bExp, zExp;
5214     uint64_t aSig, bSig, zSig0, zSig1;
5215     int32_t expDiff;
5216 
5217     aSig = extractFloatx80Frac( a );
5218     aExp = extractFloatx80Exp( a );
5219     bSig = extractFloatx80Frac( b );
5220     bExp = extractFloatx80Exp( b );
5221     expDiff = aExp - bExp;
5222     if ( 0 < expDiff ) {
5223         if ( aExp == 0x7FFF ) {
5224             if ((uint64_t)(aSig << 1)) {
5225                 return propagateFloatx80NaN(a, b, status);
5226             }
5227             return a;
5228         }
5229         if ( bExp == 0 ) --expDiff;
5230         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5231         zExp = aExp;
5232     }
5233     else if ( expDiff < 0 ) {
5234         if ( bExp == 0x7FFF ) {
5235             if ((uint64_t)(bSig << 1)) {
5236                 return propagateFloatx80NaN(a, b, status);
5237             }
5238             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5239         }
5240         if ( aExp == 0 ) ++expDiff;
5241         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5242         zExp = bExp;
5243     }
5244     else {
5245         if ( aExp == 0x7FFF ) {
5246             if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5247                 return propagateFloatx80NaN(a, b, status);
5248             }
5249             return a;
5250         }
5251         zSig1 = 0;
5252         zSig0 = aSig + bSig;
5253         if ( aExp == 0 ) {
5254             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
5255             goto roundAndPack;
5256         }
5257         zExp = aExp;
5258         goto shiftRight1;
5259     }
5260     zSig0 = aSig + bSig;
5261     if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
5262  shiftRight1:
5263     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
5264     zSig0 |= LIT64( 0x8000000000000000 );
5265     ++zExp;
5266  roundAndPack:
5267     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5268                                 zSign, zExp, zSig0, zSig1, status);
5269 }
5270 
5271 /*----------------------------------------------------------------------------
5272 | Returns the result of subtracting the absolute values of the extended
5273 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
5274 | difference is negated before being returned.  `zSign' is ignored if the
5275 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
5276 | Standard for Binary Floating-Point Arithmetic.
5277 *----------------------------------------------------------------------------*/
5278 
5279 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5280                                 float_status *status)
5281 {
5282     int32_t aExp, bExp, zExp;
5283     uint64_t aSig, bSig, zSig0, zSig1;
5284     int32_t expDiff;
5285 
5286     aSig = extractFloatx80Frac( a );
5287     aExp = extractFloatx80Exp( a );
5288     bSig = extractFloatx80Frac( b );
5289     bExp = extractFloatx80Exp( b );
5290     expDiff = aExp - bExp;
5291     if ( 0 < expDiff ) goto aExpBigger;
5292     if ( expDiff < 0 ) goto bExpBigger;
5293     if ( aExp == 0x7FFF ) {
5294         if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5295             return propagateFloatx80NaN(a, b, status);
5296         }
5297         float_raise(float_flag_invalid, status);
5298         return floatx80_default_nan(status);
5299     }
5300     if ( aExp == 0 ) {
5301         aExp = 1;
5302         bExp = 1;
5303     }
5304     zSig1 = 0;
5305     if ( bSig < aSig ) goto aBigger;
5306     if ( aSig < bSig ) goto bBigger;
5307     return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
5308  bExpBigger:
5309     if ( bExp == 0x7FFF ) {
5310         if ((uint64_t)(bSig << 1)) {
5311             return propagateFloatx80NaN(a, b, status);
5312         }
5313         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
5314     }
5315     if ( aExp == 0 ) ++expDiff;
5316     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5317  bBigger:
5318     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
5319     zExp = bExp;
5320     zSign ^= 1;
5321     goto normalizeRoundAndPack;
5322  aExpBigger:
5323     if ( aExp == 0x7FFF ) {
5324         if ((uint64_t)(aSig << 1)) {
5325             return propagateFloatx80NaN(a, b, status);
5326         }
5327         return a;
5328     }
5329     if ( bExp == 0 ) --expDiff;
5330     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5331  aBigger:
5332     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
5333     zExp = aExp;
5334  normalizeRoundAndPack:
5335     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
5336                                          zSign, zExp, zSig0, zSig1, status);
5337 }
5338 
5339 /*----------------------------------------------------------------------------
5340 | Returns the result of adding the extended double-precision floating-point
5341 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
5342 | Standard for Binary Floating-Point Arithmetic.
5343 *----------------------------------------------------------------------------*/
5344 
5345 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
5346 {
5347     flag aSign, bSign;
5348 
5349     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5350         float_raise(float_flag_invalid, status);
5351         return floatx80_default_nan(status);
5352     }
5353     aSign = extractFloatx80Sign( a );
5354     bSign = extractFloatx80Sign( b );
5355     if ( aSign == bSign ) {
5356         return addFloatx80Sigs(a, b, aSign, status);
5357     }
5358     else {
5359         return subFloatx80Sigs(a, b, aSign, status);
5360     }
5361 
5362 }
5363 
5364 /*----------------------------------------------------------------------------
5365 | Returns the result of subtracting the extended double-precision floating-
5366 | point values `a' and `b'.  The operation is performed according to the
5367 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5368 *----------------------------------------------------------------------------*/
5369 
5370 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
5371 {
5372     flag aSign, bSign;
5373 
5374     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5375         float_raise(float_flag_invalid, status);
5376         return floatx80_default_nan(status);
5377     }
5378     aSign = extractFloatx80Sign( a );
5379     bSign = extractFloatx80Sign( b );
5380     if ( aSign == bSign ) {
5381         return subFloatx80Sigs(a, b, aSign, status);
5382     }
5383     else {
5384         return addFloatx80Sigs(a, b, aSign, status);
5385     }
5386 
5387 }
5388 
5389 /*----------------------------------------------------------------------------
5390 | Returns the result of multiplying the extended double-precision floating-
5391 | point values `a' and `b'.  The operation is performed according to the
5392 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5393 *----------------------------------------------------------------------------*/
5394 
5395 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
5396 {
5397     flag aSign, bSign, zSign;
5398     int32_t aExp, bExp, zExp;
5399     uint64_t aSig, bSig, zSig0, zSig1;
5400 
5401     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5402         float_raise(float_flag_invalid, status);
5403         return floatx80_default_nan(status);
5404     }
5405     aSig = extractFloatx80Frac( a );
5406     aExp = extractFloatx80Exp( a );
5407     aSign = extractFloatx80Sign( a );
5408     bSig = extractFloatx80Frac( b );
5409     bExp = extractFloatx80Exp( b );
5410     bSign = extractFloatx80Sign( b );
5411     zSign = aSign ^ bSign;
5412     if ( aExp == 0x7FFF ) {
5413         if (    (uint64_t) ( aSig<<1 )
5414              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5415             return propagateFloatx80NaN(a, b, status);
5416         }
5417         if ( ( bExp | bSig ) == 0 ) goto invalid;
5418         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5419     }
5420     if ( bExp == 0x7FFF ) {
5421         if ((uint64_t)(bSig << 1)) {
5422             return propagateFloatx80NaN(a, b, status);
5423         }
5424         if ( ( aExp | aSig ) == 0 ) {
5425  invalid:
5426             float_raise(float_flag_invalid, status);
5427             return floatx80_default_nan(status);
5428         }
5429         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5430     }
5431     if ( aExp == 0 ) {
5432         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5433         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5434     }
5435     if ( bExp == 0 ) {
5436         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5437         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5438     }
5439     zExp = aExp + bExp - 0x3FFE;
5440     mul64To128( aSig, bSig, &zSig0, &zSig1 );
5441     if ( 0 < (int64_t) zSig0 ) {
5442         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5443         --zExp;
5444     }
5445     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5446                                 zSign, zExp, zSig0, zSig1, status);
5447 }
5448 
5449 /*----------------------------------------------------------------------------
5450 | Returns the result of dividing the extended double-precision floating-point
5451 | value `a' by the corresponding value `b'.  The operation is performed
5452 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5453 *----------------------------------------------------------------------------*/
5454 
5455 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
5456 {
5457     flag aSign, bSign, zSign;
5458     int32_t aExp, bExp, zExp;
5459     uint64_t aSig, bSig, zSig0, zSig1;
5460     uint64_t rem0, rem1, rem2, term0, term1, term2;
5461 
5462     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5463         float_raise(float_flag_invalid, status);
5464         return floatx80_default_nan(status);
5465     }
5466     aSig = extractFloatx80Frac( a );
5467     aExp = extractFloatx80Exp( a );
5468     aSign = extractFloatx80Sign( a );
5469     bSig = extractFloatx80Frac( b );
5470     bExp = extractFloatx80Exp( b );
5471     bSign = extractFloatx80Sign( b );
5472     zSign = aSign ^ bSign;
5473     if ( aExp == 0x7FFF ) {
5474         if ((uint64_t)(aSig << 1)) {
5475             return propagateFloatx80NaN(a, b, status);
5476         }
5477         if ( bExp == 0x7FFF ) {
5478             if ((uint64_t)(bSig << 1)) {
5479                 return propagateFloatx80NaN(a, b, status);
5480             }
5481             goto invalid;
5482         }
5483         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5484     }
5485     if ( bExp == 0x7FFF ) {
5486         if ((uint64_t)(bSig << 1)) {
5487             return propagateFloatx80NaN(a, b, status);
5488         }
5489         return packFloatx80( zSign, 0, 0 );
5490     }
5491     if ( bExp == 0 ) {
5492         if ( bSig == 0 ) {
5493             if ( ( aExp | aSig ) == 0 ) {
5494  invalid:
5495                 float_raise(float_flag_invalid, status);
5496                 return floatx80_default_nan(status);
5497             }
5498             float_raise(float_flag_divbyzero, status);
5499             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5500         }
5501         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5502     }
5503     if ( aExp == 0 ) {
5504         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5505         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5506     }
5507     zExp = aExp - bExp + 0x3FFE;
5508     rem1 = 0;
5509     if ( bSig <= aSig ) {
5510         shift128Right( aSig, 0, 1, &aSig, &rem1 );
5511         ++zExp;
5512     }
5513     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5514     mul64To128( bSig, zSig0, &term0, &term1 );
5515     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
5516     while ( (int64_t) rem0 < 0 ) {
5517         --zSig0;
5518         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5519     }
5520     zSig1 = estimateDiv128To64( rem1, 0, bSig );
5521     if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
5522         mul64To128( bSig, zSig1, &term1, &term2 );
5523         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5524         while ( (int64_t) rem1 < 0 ) {
5525             --zSig1;
5526             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5527         }
5528         zSig1 |= ( ( rem1 | rem2 ) != 0 );
5529     }
5530     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5531                                 zSign, zExp, zSig0, zSig1, status);
5532 }
5533 
5534 /*----------------------------------------------------------------------------
5535 | Returns the remainder of the extended double-precision floating-point value
5536 | `a' with respect to the corresponding value `b'.  The operation is performed
5537 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5538 *----------------------------------------------------------------------------*/
5539 
5540 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
5541 {
5542     flag aSign, zSign;
5543     int32_t aExp, bExp, expDiff;
5544     uint64_t aSig0, aSig1, bSig;
5545     uint64_t q, term0, term1, alternateASig0, alternateASig1;
5546 
5547     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5548         float_raise(float_flag_invalid, status);
5549         return floatx80_default_nan(status);
5550     }
5551     aSig0 = extractFloatx80Frac( a );
5552     aExp = extractFloatx80Exp( a );
5553     aSign = extractFloatx80Sign( a );
5554     bSig = extractFloatx80Frac( b );
5555     bExp = extractFloatx80Exp( b );
5556     if ( aExp == 0x7FFF ) {
5557         if (    (uint64_t) ( aSig0<<1 )
5558              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5559             return propagateFloatx80NaN(a, b, status);
5560         }
5561         goto invalid;
5562     }
5563     if ( bExp == 0x7FFF ) {
5564         if ((uint64_t)(bSig << 1)) {
5565             return propagateFloatx80NaN(a, b, status);
5566         }
5567         return a;
5568     }
5569     if ( bExp == 0 ) {
5570         if ( bSig == 0 ) {
5571  invalid:
5572             float_raise(float_flag_invalid, status);
5573             return floatx80_default_nan(status);
5574         }
5575         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5576     }
5577     if ( aExp == 0 ) {
5578         if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
5579         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5580     }
5581     bSig |= LIT64( 0x8000000000000000 );
5582     zSign = aSign;
5583     expDiff = aExp - bExp;
5584     aSig1 = 0;
5585     if ( expDiff < 0 ) {
5586         if ( expDiff < -1 ) return a;
5587         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5588         expDiff = 0;
5589     }
5590     q = ( bSig <= aSig0 );
5591     if ( q ) aSig0 -= bSig;
5592     expDiff -= 64;
5593     while ( 0 < expDiff ) {
5594         q = estimateDiv128To64( aSig0, aSig1, bSig );
5595         q = ( 2 < q ) ? q - 2 : 0;
5596         mul64To128( bSig, q, &term0, &term1 );
5597         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5598         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5599         expDiff -= 62;
5600     }
5601     expDiff += 64;
5602     if ( 0 < expDiff ) {
5603         q = estimateDiv128To64( aSig0, aSig1, bSig );
5604         q = ( 2 < q ) ? q - 2 : 0;
5605         q >>= 64 - expDiff;
5606         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5607         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5608         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5609         while ( le128( term0, term1, aSig0, aSig1 ) ) {
5610             ++q;
5611             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5612         }
5613     }
5614     else {
5615         term1 = 0;
5616         term0 = bSig;
5617     }
5618     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5619     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5620          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5621               && ( q & 1 ) )
5622        ) {
5623         aSig0 = alternateASig0;
5624         aSig1 = alternateASig1;
5625         zSign = ! zSign;
5626     }
5627     return
5628         normalizeRoundAndPackFloatx80(
5629             80, zSign, bExp + expDiff, aSig0, aSig1, status);
5630 
5631 }
5632 
5633 /*----------------------------------------------------------------------------
5634 | Returns the square root of the extended double-precision floating-point
5635 | value `a'.  The operation is performed according to the IEC/IEEE Standard
5636 | for Binary Floating-Point Arithmetic.
5637 *----------------------------------------------------------------------------*/
5638 
5639 floatx80 floatx80_sqrt(floatx80 a, float_status *status)
5640 {
5641     flag aSign;
5642     int32_t aExp, zExp;
5643     uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5644     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5645 
5646     if (floatx80_invalid_encoding(a)) {
5647         float_raise(float_flag_invalid, status);
5648         return floatx80_default_nan(status);
5649     }
5650     aSig0 = extractFloatx80Frac( a );
5651     aExp = extractFloatx80Exp( a );
5652     aSign = extractFloatx80Sign( a );
5653     if ( aExp == 0x7FFF ) {
5654         if ((uint64_t)(aSig0 << 1)) {
5655             return propagateFloatx80NaN(a, a, status);
5656         }
5657         if ( ! aSign ) return a;
5658         goto invalid;
5659     }
5660     if ( aSign ) {
5661         if ( ( aExp | aSig0 ) == 0 ) return a;
5662  invalid:
5663         float_raise(float_flag_invalid, status);
5664         return floatx80_default_nan(status);
5665     }
5666     if ( aExp == 0 ) {
5667         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5668         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5669     }
5670     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5671     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5672     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5673     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5674     doubleZSig0 = zSig0<<1;
5675     mul64To128( zSig0, zSig0, &term0, &term1 );
5676     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5677     while ( (int64_t) rem0 < 0 ) {
5678         --zSig0;
5679         doubleZSig0 -= 2;
5680         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5681     }
5682     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5683     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5684         if ( zSig1 == 0 ) zSig1 = 1;
5685         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5686         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5687         mul64To128( zSig1, zSig1, &term2, &term3 );
5688         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5689         while ( (int64_t) rem1 < 0 ) {
5690             --zSig1;
5691             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5692             term3 |= 1;
5693             term2 |= doubleZSig0;
5694             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5695         }
5696         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5697     }
5698     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5699     zSig0 |= doubleZSig0;
5700     return roundAndPackFloatx80(status->floatx80_rounding_precision,
5701                                 0, zExp, zSig0, zSig1, status);
5702 }
5703 
5704 /*----------------------------------------------------------------------------
5705 | Returns 1 if the extended double-precision floating-point value `a' is equal
5706 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
5707 | raised if either operand is a NaN.  Otherwise, the comparison is performed
5708 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5709 *----------------------------------------------------------------------------*/
5710 
5711 int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
5712 {
5713 
5714     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5715         || (extractFloatx80Exp(a) == 0x7FFF
5716             && (uint64_t) (extractFloatx80Frac(a) << 1))
5717         || (extractFloatx80Exp(b) == 0x7FFF
5718             && (uint64_t) (extractFloatx80Frac(b) << 1))
5719        ) {
5720         float_raise(float_flag_invalid, status);
5721         return 0;
5722     }
5723     return
5724            ( a.low == b.low )
5725         && (    ( a.high == b.high )
5726              || (    ( a.low == 0 )
5727                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5728            );
5729 
5730 }
5731 
5732 /*----------------------------------------------------------------------------
5733 | Returns 1 if the extended double-precision floating-point value `a' is
5734 | less than or equal to the corresponding value `b', and 0 otherwise.  The
5735 | invalid exception is raised if either operand is a NaN.  The comparison is
5736 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5737 | Arithmetic.
5738 *----------------------------------------------------------------------------*/
5739 
5740 int floatx80_le(floatx80 a, floatx80 b, float_status *status)
5741 {
5742     flag aSign, bSign;
5743 
5744     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5745         || (extractFloatx80Exp(a) == 0x7FFF
5746             && (uint64_t) (extractFloatx80Frac(a) << 1))
5747         || (extractFloatx80Exp(b) == 0x7FFF
5748             && (uint64_t) (extractFloatx80Frac(b) << 1))
5749        ) {
5750         float_raise(float_flag_invalid, status);
5751         return 0;
5752     }
5753     aSign = extractFloatx80Sign( a );
5754     bSign = extractFloatx80Sign( b );
5755     if ( aSign != bSign ) {
5756         return
5757                aSign
5758             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5759                  == 0 );
5760     }
5761     return
5762           aSign ? le128( b.high, b.low, a.high, a.low )
5763         : le128( a.high, a.low, b.high, b.low );
5764 
5765 }
5766 
5767 /*----------------------------------------------------------------------------
5768 | Returns 1 if the extended double-precision floating-point value `a' is
5769 | less than the corresponding value `b', and 0 otherwise.  The invalid
5770 | exception is raised if either operand is a NaN.  The comparison is performed
5771 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5772 *----------------------------------------------------------------------------*/
5773 
5774 int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
5775 {
5776     flag aSign, bSign;
5777 
5778     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5779         || (extractFloatx80Exp(a) == 0x7FFF
5780             && (uint64_t) (extractFloatx80Frac(a) << 1))
5781         || (extractFloatx80Exp(b) == 0x7FFF
5782             && (uint64_t) (extractFloatx80Frac(b) << 1))
5783        ) {
5784         float_raise(float_flag_invalid, status);
5785         return 0;
5786     }
5787     aSign = extractFloatx80Sign( a );
5788     bSign = extractFloatx80Sign( b );
5789     if ( aSign != bSign ) {
5790         return
5791                aSign
5792             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5793                  != 0 );
5794     }
5795     return
5796           aSign ? lt128( b.high, b.low, a.high, a.low )
5797         : lt128( a.high, a.low, b.high, b.low );
5798 
5799 }
5800 
5801 /*----------------------------------------------------------------------------
5802 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5803 | cannot be compared, and 0 otherwise.  The invalid exception is raised if
5804 | either operand is a NaN.   The comparison is performed according to the
5805 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5806 *----------------------------------------------------------------------------*/
5807 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
5808 {
5809     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)
5810         || (extractFloatx80Exp(a) == 0x7FFF
5811             && (uint64_t) (extractFloatx80Frac(a) << 1))
5812         || (extractFloatx80Exp(b) == 0x7FFF
5813             && (uint64_t) (extractFloatx80Frac(b) << 1))
5814        ) {
5815         float_raise(float_flag_invalid, status);
5816         return 1;
5817     }
5818     return 0;
5819 }
5820 
5821 /*----------------------------------------------------------------------------
5822 | Returns 1 if the extended double-precision floating-point value `a' is
5823 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5824 | cause an exception.  The comparison is performed according to the IEC/IEEE
5825 | Standard for Binary Floating-Point Arithmetic.
5826 *----------------------------------------------------------------------------*/
5827 
5828 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
5829 {
5830 
5831     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5832         float_raise(float_flag_invalid, status);
5833         return 0;
5834     }
5835     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5836               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5837          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5838               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5839        ) {
5840         if (floatx80_is_signaling_nan(a, status)
5841          || floatx80_is_signaling_nan(b, status)) {
5842             float_raise(float_flag_invalid, status);
5843         }
5844         return 0;
5845     }
5846     return
5847            ( a.low == b.low )
5848         && (    ( a.high == b.high )
5849              || (    ( a.low == 0 )
5850                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5851            );
5852 
5853 }
5854 
5855 /*----------------------------------------------------------------------------
5856 | Returns 1 if the extended double-precision floating-point value `a' is less
5857 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
5858 | do not cause an exception.  Otherwise, the comparison is performed according
5859 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5860 *----------------------------------------------------------------------------*/
5861 
5862 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
5863 {
5864     flag aSign, bSign;
5865 
5866     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5867         float_raise(float_flag_invalid, status);
5868         return 0;
5869     }
5870     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5871               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5872          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5873               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5874        ) {
5875         if (floatx80_is_signaling_nan(a, status)
5876          || floatx80_is_signaling_nan(b, status)) {
5877             float_raise(float_flag_invalid, status);
5878         }
5879         return 0;
5880     }
5881     aSign = extractFloatx80Sign( a );
5882     bSign = extractFloatx80Sign( b );
5883     if ( aSign != bSign ) {
5884         return
5885                aSign
5886             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5887                  == 0 );
5888     }
5889     return
5890           aSign ? le128( b.high, b.low, a.high, a.low )
5891         : le128( a.high, a.low, b.high, b.low );
5892 
5893 }
5894 
5895 /*----------------------------------------------------------------------------
5896 | Returns 1 if the extended double-precision floating-point value `a' is less
5897 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
5898 | an exception.  Otherwise, the comparison is performed according to the
5899 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5900 *----------------------------------------------------------------------------*/
5901 
5902 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
5903 {
5904     flag aSign, bSign;
5905 
5906     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5907         float_raise(float_flag_invalid, status);
5908         return 0;
5909     }
5910     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5911               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5912          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5913               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5914        ) {
5915         if (floatx80_is_signaling_nan(a, status)
5916          || floatx80_is_signaling_nan(b, status)) {
5917             float_raise(float_flag_invalid, status);
5918         }
5919         return 0;
5920     }
5921     aSign = extractFloatx80Sign( a );
5922     bSign = extractFloatx80Sign( b );
5923     if ( aSign != bSign ) {
5924         return
5925                aSign
5926             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5927                  != 0 );
5928     }
5929     return
5930           aSign ? lt128( b.high, b.low, a.high, a.low )
5931         : lt128( a.high, a.low, b.high, b.low );
5932 
5933 }
5934 
5935 /*----------------------------------------------------------------------------
5936 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5937 | cannot be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.
5938 | The comparison is performed according to the IEC/IEEE Standard for Binary
5939 | Floating-Point Arithmetic.
5940 *----------------------------------------------------------------------------*/
5941 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
5942 {
5943     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
5944         float_raise(float_flag_invalid, status);
5945         return 1;
5946     }
5947     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5948               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5949          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5950               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5951        ) {
5952         if (floatx80_is_signaling_nan(a, status)
5953          || floatx80_is_signaling_nan(b, status)) {
5954             float_raise(float_flag_invalid, status);
5955         }
5956         return 1;
5957     }
5958     return 0;
5959 }
5960 
5961 /*----------------------------------------------------------------------------
5962 | Returns the result of converting the quadruple-precision floating-point
5963 | value `a' to the 32-bit two's complement integer format.  The conversion
5964 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5965 | Arithmetic---which means in particular that the conversion is rounded
5966 | according to the current rounding mode.  If `a' is a NaN, the largest
5967 | positive integer is returned.  Otherwise, if the conversion overflows, the
5968 | largest integer with the same sign as `a' is returned.
5969 *----------------------------------------------------------------------------*/
5970 
5971 int32_t float128_to_int32(float128 a, float_status *status)
5972 {
5973     flag aSign;
5974     int32_t aExp, shiftCount;
5975     uint64_t aSig0, aSig1;
5976 
5977     aSig1 = extractFloat128Frac1( a );
5978     aSig0 = extractFloat128Frac0( a );
5979     aExp = extractFloat128Exp( a );
5980     aSign = extractFloat128Sign( a );
5981     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5982     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5983     aSig0 |= ( aSig1 != 0 );
5984     shiftCount = 0x4028 - aExp;
5985     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5986     return roundAndPackInt32(aSign, aSig0, status);
5987 
5988 }
5989 
5990 /*----------------------------------------------------------------------------
5991 | Returns the result of converting the quadruple-precision floating-point
5992 | value `a' to the 32-bit two's complement integer format.  The conversion
5993 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5994 | Arithmetic, except that the conversion is always rounded toward zero.  If
5995 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
5996 | conversion overflows, the largest integer with the same sign as `a' is
5997 | returned.
5998 *----------------------------------------------------------------------------*/
5999 
6000 int32_t float128_to_int32_round_to_zero(float128 a, float_status *status)
6001 {
6002     flag aSign;
6003     int32_t aExp, shiftCount;
6004     uint64_t aSig0, aSig1, savedASig;
6005     int32_t z;
6006 
6007     aSig1 = extractFloat128Frac1( a );
6008     aSig0 = extractFloat128Frac0( a );
6009     aExp = extractFloat128Exp( a );
6010     aSign = extractFloat128Sign( a );
6011     aSig0 |= ( aSig1 != 0 );
6012     if ( 0x401E < aExp ) {
6013         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
6014         goto invalid;
6015     }
6016     else if ( aExp < 0x3FFF ) {
6017         if (aExp || aSig0) {
6018             status->float_exception_flags |= float_flag_inexact;
6019         }
6020         return 0;
6021     }
6022     aSig0 |= LIT64( 0x0001000000000000 );
6023     shiftCount = 0x402F - aExp;
6024     savedASig = aSig0;
6025     aSig0 >>= shiftCount;
6026     z = aSig0;
6027     if ( aSign ) z = - z;
6028     if ( ( z < 0 ) ^ aSign ) {
6029  invalid:
6030         float_raise(float_flag_invalid, status);
6031         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
6032     }
6033     if ( ( aSig0<<shiftCount ) != savedASig ) {
6034         status->float_exception_flags |= float_flag_inexact;
6035     }
6036     return z;
6037 
6038 }
6039 
6040 /*----------------------------------------------------------------------------
6041 | Returns the result of converting the quadruple-precision floating-point
6042 | value `a' to the 64-bit two's complement integer format.  The conversion
6043 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6044 | Arithmetic---which means in particular that the conversion is rounded
6045 | according to the current rounding mode.  If `a' is a NaN, the largest
6046 | positive integer is returned.  Otherwise, if the conversion overflows, the
6047 | largest integer with the same sign as `a' is returned.
6048 *----------------------------------------------------------------------------*/
6049 
6050 int64_t float128_to_int64(float128 a, float_status *status)
6051 {
6052     flag aSign;
6053     int32_t aExp, shiftCount;
6054     uint64_t aSig0, aSig1;
6055 
6056     aSig1 = extractFloat128Frac1( a );
6057     aSig0 = extractFloat128Frac0( a );
6058     aExp = extractFloat128Exp( a );
6059     aSign = extractFloat128Sign( a );
6060     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
6061     shiftCount = 0x402F - aExp;
6062     if ( shiftCount <= 0 ) {
6063         if ( 0x403E < aExp ) {
6064             float_raise(float_flag_invalid, status);
6065             if (    ! aSign
6066                  || (    ( aExp == 0x7FFF )
6067                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
6068                     )
6069                ) {
6070                 return LIT64( 0x7FFFFFFFFFFFFFFF );
6071             }
6072             return (int64_t) LIT64( 0x8000000000000000 );
6073         }
6074         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
6075     }
6076     else {
6077         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
6078     }
6079     return roundAndPackInt64(aSign, aSig0, aSig1, status);
6080 
6081 }
6082 
6083 /*----------------------------------------------------------------------------
6084 | Returns the result of converting the quadruple-precision floating-point
6085 | value `a' to the 64-bit two's complement integer format.  The conversion
6086 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6087 | Arithmetic, except that the conversion is always rounded toward zero.
6088 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
6089 | the conversion overflows, the largest integer with the same sign as `a' is
6090 | returned.
6091 *----------------------------------------------------------------------------*/
6092 
6093 int64_t float128_to_int64_round_to_zero(float128 a, float_status *status)
6094 {
6095     flag aSign;
6096     int32_t aExp, shiftCount;
6097     uint64_t aSig0, aSig1;
6098     int64_t z;
6099 
6100     aSig1 = extractFloat128Frac1( a );
6101     aSig0 = extractFloat128Frac0( a );
6102     aExp = extractFloat128Exp( a );
6103     aSign = extractFloat128Sign( a );
6104     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
6105     shiftCount = aExp - 0x402F;
6106     if ( 0 < shiftCount ) {
6107         if ( 0x403E <= aExp ) {
6108             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
6109             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
6110                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
6111                 if (aSig1) {
6112                     status->float_exception_flags |= float_flag_inexact;
6113                 }
6114             }
6115             else {
6116                 float_raise(float_flag_invalid, status);
6117                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
6118                     return LIT64( 0x7FFFFFFFFFFFFFFF );
6119                 }
6120             }
6121             return (int64_t) LIT64( 0x8000000000000000 );
6122         }
6123         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
6124         if ( (uint64_t) ( aSig1<<shiftCount ) ) {
6125             status->float_exception_flags |= float_flag_inexact;
6126         }
6127     }
6128     else {
6129         if ( aExp < 0x3FFF ) {
6130             if ( aExp | aSig0 | aSig1 ) {
6131                 status->float_exception_flags |= float_flag_inexact;
6132             }
6133             return 0;
6134         }
6135         z = aSig0>>( - shiftCount );
6136         if (    aSig1
6137              || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
6138             status->float_exception_flags |= float_flag_inexact;
6139         }
6140     }
6141     if ( aSign ) z = - z;
6142     return z;
6143 
6144 }
6145 
6146 /*----------------------------------------------------------------------------
6147 | Returns the result of converting the quadruple-precision floating-point value
6148 | `a' to the 64-bit unsigned integer format.  The conversion is
6149 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6150 | Arithmetic---which means in particular that the conversion is rounded
6151 | according to the current rounding mode.  If `a' is a NaN, the largest
6152 | positive integer is returned.  If the conversion overflows, the
6153 | largest unsigned integer is returned.  If 'a' is negative, the value is
6154 | rounded and zero is returned; negative values that do not round to zero
6155 | will raise the inexact exception.
6156 *----------------------------------------------------------------------------*/
6157 
6158 uint64_t float128_to_uint64(float128 a, float_status *status)
6159 {
6160     flag aSign;
6161     int aExp;
6162     int shiftCount;
6163     uint64_t aSig0, aSig1;
6164 
6165     aSig0 = extractFloat128Frac0(a);
6166     aSig1 = extractFloat128Frac1(a);
6167     aExp = extractFloat128Exp(a);
6168     aSign = extractFloat128Sign(a);
6169     if (aSign && (aExp > 0x3FFE)) {
6170         float_raise(float_flag_invalid, status);
6171         if (float128_is_any_nan(a)) {
6172             return LIT64(0xFFFFFFFFFFFFFFFF);
6173         } else {
6174             return 0;
6175         }
6176     }
6177     if (aExp) {
6178         aSig0 |= LIT64(0x0001000000000000);
6179     }
6180     shiftCount = 0x402F - aExp;
6181     if (shiftCount <= 0) {
6182         if (0x403E < aExp) {
6183             float_raise(float_flag_invalid, status);
6184             return LIT64(0xFFFFFFFFFFFFFFFF);
6185         }
6186         shortShift128Left(aSig0, aSig1, -shiftCount, &aSig0, &aSig1);
6187     } else {
6188         shift64ExtraRightJamming(aSig0, aSig1, shiftCount, &aSig0, &aSig1);
6189     }
6190     return roundAndPackUint64(aSign, aSig0, aSig1, status);
6191 }
6192 
6193 uint64_t float128_to_uint64_round_to_zero(float128 a, float_status *status)
6194 {
6195     uint64_t v;
6196     signed char current_rounding_mode = status->float_rounding_mode;
6197 
6198     set_float_rounding_mode(float_round_to_zero, status);
6199     v = float128_to_uint64(a, status);
6200     set_float_rounding_mode(current_rounding_mode, status);
6201 
6202     return v;
6203 }
6204 
6205 /*----------------------------------------------------------------------------
6206 | Returns the result of converting the quadruple-precision floating-point
6207 | value `a' to the 32-bit unsigned integer format.  The conversion
6208 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6209 | Arithmetic except that the conversion is always rounded toward zero.
6210 | If `a' is a NaN, the largest positive integer is returned.  Otherwise,
6211 | if the conversion overflows, the largest unsigned integer is returned.
6212 | If 'a' is negative, the value is rounded and zero is returned; negative
6213 | values that do not round to zero will raise the inexact exception.
6214 *----------------------------------------------------------------------------*/
6215 
6216 uint32_t float128_to_uint32_round_to_zero(float128 a, float_status *status)
6217 {
6218     uint64_t v;
6219     uint32_t res;
6220     int old_exc_flags = get_float_exception_flags(status);
6221 
6222     v = float128_to_uint64_round_to_zero(a, status);
6223     if (v > 0xffffffff) {
6224         res = 0xffffffff;
6225     } else {
6226         return v;
6227     }
6228     set_float_exception_flags(old_exc_flags, status);
6229     float_raise(float_flag_invalid, status);
6230     return res;
6231 }
6232 
6233 /*----------------------------------------------------------------------------
6234 | Returns the result of converting the quadruple-precision floating-point
6235 | value `a' to the single-precision floating-point format.  The conversion
6236 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6237 | Arithmetic.
6238 *----------------------------------------------------------------------------*/
6239 
6240 float32 float128_to_float32(float128 a, float_status *status)
6241 {
6242     flag aSign;
6243     int32_t aExp;
6244     uint64_t aSig0, aSig1;
6245     uint32_t zSig;
6246 
6247     aSig1 = extractFloat128Frac1( a );
6248     aSig0 = extractFloat128Frac0( a );
6249     aExp = extractFloat128Exp( a );
6250     aSign = extractFloat128Sign( a );
6251     if ( aExp == 0x7FFF ) {
6252         if ( aSig0 | aSig1 ) {
6253             return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
6254         }
6255         return packFloat32( aSign, 0xFF, 0 );
6256     }
6257     aSig0 |= ( aSig1 != 0 );
6258     shift64RightJamming( aSig0, 18, &aSig0 );
6259     zSig = aSig0;
6260     if ( aExp || zSig ) {
6261         zSig |= 0x40000000;
6262         aExp -= 0x3F81;
6263     }
6264     return roundAndPackFloat32(aSign, aExp, zSig, status);
6265 
6266 }
6267 
6268 /*----------------------------------------------------------------------------
6269 | Returns the result of converting the quadruple-precision floating-point
6270 | value `a' to the double-precision floating-point format.  The conversion
6271 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6272 | Arithmetic.
6273 *----------------------------------------------------------------------------*/
6274 
6275 float64 float128_to_float64(float128 a, float_status *status)
6276 {
6277     flag aSign;
6278     int32_t aExp;
6279     uint64_t aSig0, aSig1;
6280 
6281     aSig1 = extractFloat128Frac1( a );
6282     aSig0 = extractFloat128Frac0( a );
6283     aExp = extractFloat128Exp( a );
6284     aSign = extractFloat128Sign( a );
6285     if ( aExp == 0x7FFF ) {
6286         if ( aSig0 | aSig1 ) {
6287             return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
6288         }
6289         return packFloat64( aSign, 0x7FF, 0 );
6290     }
6291     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6292     aSig0 |= ( aSig1 != 0 );
6293     if ( aExp || aSig0 ) {
6294         aSig0 |= LIT64( 0x4000000000000000 );
6295         aExp -= 0x3C01;
6296     }
6297     return roundAndPackFloat64(aSign, aExp, aSig0, status);
6298 
6299 }
6300 
6301 /*----------------------------------------------------------------------------
6302 | Returns the result of converting the quadruple-precision floating-point
6303 | value `a' to the extended double-precision floating-point format.  The
6304 | conversion is performed according to the IEC/IEEE Standard for Binary
6305 | Floating-Point Arithmetic.
6306 *----------------------------------------------------------------------------*/
6307 
6308 floatx80 float128_to_floatx80(float128 a, float_status *status)
6309 {
6310     flag aSign;
6311     int32_t aExp;
6312     uint64_t aSig0, aSig1;
6313 
6314     aSig1 = extractFloat128Frac1( a );
6315     aSig0 = extractFloat128Frac0( a );
6316     aExp = extractFloat128Exp( a );
6317     aSign = extractFloat128Sign( a );
6318     if ( aExp == 0x7FFF ) {
6319         if ( aSig0 | aSig1 ) {
6320             return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
6321         }
6322         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
6323     }
6324     if ( aExp == 0 ) {
6325         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
6326         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6327     }
6328     else {
6329         aSig0 |= LIT64( 0x0001000000000000 );
6330     }
6331     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
6332     return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
6333 
6334 }
6335 
6336 /*----------------------------------------------------------------------------
6337 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6338 | returns the result as a quadruple-precision floating-point value.  The
6339 | operation is performed according to the IEC/IEEE Standard for Binary
6340 | Floating-Point Arithmetic.
6341 *----------------------------------------------------------------------------*/
6342 
6343 float128 float128_round_to_int(float128 a, float_status *status)
6344 {
6345     flag aSign;
6346     int32_t aExp;
6347     uint64_t lastBitMask, roundBitsMask;
6348     float128 z;
6349 
6350     aExp = extractFloat128Exp( a );
6351     if ( 0x402F <= aExp ) {
6352         if ( 0x406F <= aExp ) {
6353             if (    ( aExp == 0x7FFF )
6354                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
6355                ) {
6356                 return propagateFloat128NaN(a, a, status);
6357             }
6358             return a;
6359         }
6360         lastBitMask = 1;
6361         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
6362         roundBitsMask = lastBitMask - 1;
6363         z = a;
6364         switch (status->float_rounding_mode) {
6365         case float_round_nearest_even:
6366             if ( lastBitMask ) {
6367                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
6368                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
6369             }
6370             else {
6371                 if ( (int64_t) z.low < 0 ) {
6372                     ++z.high;
6373                     if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
6374                 }
6375             }
6376             break;
6377         case float_round_ties_away:
6378             if (lastBitMask) {
6379                 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
6380             } else {
6381                 if ((int64_t) z.low < 0) {
6382                     ++z.high;
6383                 }
6384             }
6385             break;
6386         case float_round_to_zero:
6387             break;
6388         case float_round_up:
6389             if (!extractFloat128Sign(z)) {
6390                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6391             }
6392             break;
6393         case float_round_down:
6394             if (extractFloat128Sign(z)) {
6395                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6396             }
6397             break;
6398         default:
6399             abort();
6400         }
6401         z.low &= ~ roundBitsMask;
6402     }
6403     else {
6404         if ( aExp < 0x3FFF ) {
6405             if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
6406             status->float_exception_flags |= float_flag_inexact;
6407             aSign = extractFloat128Sign( a );
6408             switch (status->float_rounding_mode) {
6409              case float_round_nearest_even:
6410                 if (    ( aExp == 0x3FFE )
6411                      && (   extractFloat128Frac0( a )
6412                           | extractFloat128Frac1( a ) )
6413                    ) {
6414                     return packFloat128( aSign, 0x3FFF, 0, 0 );
6415                 }
6416                 break;
6417             case float_round_ties_away:
6418                 if (aExp == 0x3FFE) {
6419                     return packFloat128(aSign, 0x3FFF, 0, 0);
6420                 }
6421                 break;
6422              case float_round_down:
6423                 return
6424                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
6425                     : packFloat128( 0, 0, 0, 0 );
6426              case float_round_up:
6427                 return
6428                       aSign ? packFloat128( 1, 0, 0, 0 )
6429                     : packFloat128( 0, 0x3FFF, 0, 0 );
6430             }
6431             return packFloat128( aSign, 0, 0, 0 );
6432         }
6433         lastBitMask = 1;
6434         lastBitMask <<= 0x402F - aExp;
6435         roundBitsMask = lastBitMask - 1;
6436         z.low = 0;
6437         z.high = a.high;
6438         switch (status->float_rounding_mode) {
6439         case float_round_nearest_even:
6440             z.high += lastBitMask>>1;
6441             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
6442                 z.high &= ~ lastBitMask;
6443             }
6444             break;
6445         case float_round_ties_away:
6446             z.high += lastBitMask>>1;
6447             break;
6448         case float_round_to_zero:
6449             break;
6450         case float_round_up:
6451             if (!extractFloat128Sign(z)) {
6452                 z.high |= ( a.low != 0 );
6453                 z.high += roundBitsMask;
6454             }
6455             break;
6456         case float_round_down:
6457             if (extractFloat128Sign(z)) {
6458                 z.high |= (a.low != 0);
6459                 z.high += roundBitsMask;
6460             }
6461             break;
6462         default:
6463             abort();
6464         }
6465         z.high &= ~ roundBitsMask;
6466     }
6467     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
6468         status->float_exception_flags |= float_flag_inexact;
6469     }
6470     return z;
6471 
6472 }
6473 
6474 /*----------------------------------------------------------------------------
6475 | Returns the result of adding the absolute values of the quadruple-precision
6476 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
6477 | before being returned.  `zSign' is ignored if the result is a NaN.
6478 | The addition is performed according to the IEC/IEEE Standard for Binary
6479 | Floating-Point Arithmetic.
6480 *----------------------------------------------------------------------------*/
6481 
6482 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
6483                                 float_status *status)
6484 {
6485     int32_t aExp, bExp, zExp;
6486     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6487     int32_t expDiff;
6488 
6489     aSig1 = extractFloat128Frac1( a );
6490     aSig0 = extractFloat128Frac0( a );
6491     aExp = extractFloat128Exp( a );
6492     bSig1 = extractFloat128Frac1( b );
6493     bSig0 = extractFloat128Frac0( b );
6494     bExp = extractFloat128Exp( b );
6495     expDiff = aExp - bExp;
6496     if ( 0 < expDiff ) {
6497         if ( aExp == 0x7FFF ) {
6498             if (aSig0 | aSig1) {
6499                 return propagateFloat128NaN(a, b, status);
6500             }
6501             return a;
6502         }
6503         if ( bExp == 0 ) {
6504             --expDiff;
6505         }
6506         else {
6507             bSig0 |= LIT64( 0x0001000000000000 );
6508         }
6509         shift128ExtraRightJamming(
6510             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
6511         zExp = aExp;
6512     }
6513     else if ( expDiff < 0 ) {
6514         if ( bExp == 0x7FFF ) {
6515             if (bSig0 | bSig1) {
6516                 return propagateFloat128NaN(a, b, status);
6517             }
6518             return packFloat128( zSign, 0x7FFF, 0, 0 );
6519         }
6520         if ( aExp == 0 ) {
6521             ++expDiff;
6522         }
6523         else {
6524             aSig0 |= LIT64( 0x0001000000000000 );
6525         }
6526         shift128ExtraRightJamming(
6527             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
6528         zExp = bExp;
6529     }
6530     else {
6531         if ( aExp == 0x7FFF ) {
6532             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6533                 return propagateFloat128NaN(a, b, status);
6534             }
6535             return a;
6536         }
6537         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6538         if ( aExp == 0 ) {
6539             if (status->flush_to_zero) {
6540                 if (zSig0 | zSig1) {
6541                     float_raise(float_flag_output_denormal, status);
6542                 }
6543                 return packFloat128(zSign, 0, 0, 0);
6544             }
6545             return packFloat128( zSign, 0, zSig0, zSig1 );
6546         }
6547         zSig2 = 0;
6548         zSig0 |= LIT64( 0x0002000000000000 );
6549         zExp = aExp;
6550         goto shiftRight1;
6551     }
6552     aSig0 |= LIT64( 0x0001000000000000 );
6553     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6554     --zExp;
6555     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
6556     ++zExp;
6557  shiftRight1:
6558     shift128ExtraRightJamming(
6559         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6560  roundAndPack:
6561     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6562 
6563 }
6564 
6565 /*----------------------------------------------------------------------------
6566 | Returns the result of subtracting the absolute values of the quadruple-
6567 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
6568 | difference is negated before being returned.  `zSign' is ignored if the
6569 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
6570 | Standard for Binary Floating-Point Arithmetic.
6571 *----------------------------------------------------------------------------*/
6572 
6573 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
6574                                 float_status *status)
6575 {
6576     int32_t aExp, bExp, zExp;
6577     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
6578     int32_t expDiff;
6579 
6580     aSig1 = extractFloat128Frac1( a );
6581     aSig0 = extractFloat128Frac0( a );
6582     aExp = extractFloat128Exp( a );
6583     bSig1 = extractFloat128Frac1( b );
6584     bSig0 = extractFloat128Frac0( b );
6585     bExp = extractFloat128Exp( b );
6586     expDiff = aExp - bExp;
6587     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6588     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6589     if ( 0 < expDiff ) goto aExpBigger;
6590     if ( expDiff < 0 ) goto bExpBigger;
6591     if ( aExp == 0x7FFF ) {
6592         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6593             return propagateFloat128NaN(a, b, status);
6594         }
6595         float_raise(float_flag_invalid, status);
6596         return float128_default_nan(status);
6597     }
6598     if ( aExp == 0 ) {
6599         aExp = 1;
6600         bExp = 1;
6601     }
6602     if ( bSig0 < aSig0 ) goto aBigger;
6603     if ( aSig0 < bSig0 ) goto bBigger;
6604     if ( bSig1 < aSig1 ) goto aBigger;
6605     if ( aSig1 < bSig1 ) goto bBigger;
6606     return packFloat128(status->float_rounding_mode == float_round_down,
6607                         0, 0, 0);
6608  bExpBigger:
6609     if ( bExp == 0x7FFF ) {
6610         if (bSig0 | bSig1) {
6611             return propagateFloat128NaN(a, b, status);
6612         }
6613         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6614     }
6615     if ( aExp == 0 ) {
6616         ++expDiff;
6617     }
6618     else {
6619         aSig0 |= LIT64( 0x4000000000000000 );
6620     }
6621     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6622     bSig0 |= LIT64( 0x4000000000000000 );
6623  bBigger:
6624     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6625     zExp = bExp;
6626     zSign ^= 1;
6627     goto normalizeRoundAndPack;
6628  aExpBigger:
6629     if ( aExp == 0x7FFF ) {
6630         if (aSig0 | aSig1) {
6631             return propagateFloat128NaN(a, b, status);
6632         }
6633         return a;
6634     }
6635     if ( bExp == 0 ) {
6636         --expDiff;
6637     }
6638     else {
6639         bSig0 |= LIT64( 0x4000000000000000 );
6640     }
6641     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6642     aSig0 |= LIT64( 0x4000000000000000 );
6643  aBigger:
6644     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6645     zExp = aExp;
6646  normalizeRoundAndPack:
6647     --zExp;
6648     return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
6649                                          status);
6650 
6651 }
6652 
6653 /*----------------------------------------------------------------------------
6654 | Returns the result of adding the quadruple-precision floating-point values
6655 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
6656 | for Binary Floating-Point Arithmetic.
6657 *----------------------------------------------------------------------------*/
6658 
6659 float128 float128_add(float128 a, float128 b, float_status *status)
6660 {
6661     flag aSign, bSign;
6662 
6663     aSign = extractFloat128Sign( a );
6664     bSign = extractFloat128Sign( b );
6665     if ( aSign == bSign ) {
6666         return addFloat128Sigs(a, b, aSign, status);
6667     }
6668     else {
6669         return subFloat128Sigs(a, b, aSign, status);
6670     }
6671 
6672 }
6673 
6674 /*----------------------------------------------------------------------------
6675 | Returns the result of subtracting the quadruple-precision floating-point
6676 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6677 | Standard for Binary Floating-Point Arithmetic.
6678 *----------------------------------------------------------------------------*/
6679 
6680 float128 float128_sub(float128 a, float128 b, float_status *status)
6681 {
6682     flag aSign, bSign;
6683 
6684     aSign = extractFloat128Sign( a );
6685     bSign = extractFloat128Sign( b );
6686     if ( aSign == bSign ) {
6687         return subFloat128Sigs(a, b, aSign, status);
6688     }
6689     else {
6690         return addFloat128Sigs(a, b, aSign, status);
6691     }
6692 
6693 }
6694 
6695 /*----------------------------------------------------------------------------
6696 | Returns the result of multiplying the quadruple-precision floating-point
6697 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6698 | Standard for Binary Floating-Point Arithmetic.
6699 *----------------------------------------------------------------------------*/
6700 
6701 float128 float128_mul(float128 a, float128 b, float_status *status)
6702 {
6703     flag aSign, bSign, zSign;
6704     int32_t aExp, bExp, zExp;
6705     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
6706 
6707     aSig1 = extractFloat128Frac1( a );
6708     aSig0 = extractFloat128Frac0( a );
6709     aExp = extractFloat128Exp( a );
6710     aSign = extractFloat128Sign( a );
6711     bSig1 = extractFloat128Frac1( b );
6712     bSig0 = extractFloat128Frac0( b );
6713     bExp = extractFloat128Exp( b );
6714     bSign = extractFloat128Sign( b );
6715     zSign = aSign ^ bSign;
6716     if ( aExp == 0x7FFF ) {
6717         if (    ( aSig0 | aSig1 )
6718              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6719             return propagateFloat128NaN(a, b, status);
6720         }
6721         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6722         return packFloat128( zSign, 0x7FFF, 0, 0 );
6723     }
6724     if ( bExp == 0x7FFF ) {
6725         if (bSig0 | bSig1) {
6726             return propagateFloat128NaN(a, b, status);
6727         }
6728         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6729  invalid:
6730             float_raise(float_flag_invalid, status);
6731             return float128_default_nan(status);
6732         }
6733         return packFloat128( zSign, 0x7FFF, 0, 0 );
6734     }
6735     if ( aExp == 0 ) {
6736         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6737         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6738     }
6739     if ( bExp == 0 ) {
6740         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6741         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6742     }
6743     zExp = aExp + bExp - 0x4000;
6744     aSig0 |= LIT64( 0x0001000000000000 );
6745     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6746     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6747     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6748     zSig2 |= ( zSig3 != 0 );
6749     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6750         shift128ExtraRightJamming(
6751             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6752         ++zExp;
6753     }
6754     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6755 
6756 }
6757 
6758 /*----------------------------------------------------------------------------
6759 | Returns the result of dividing the quadruple-precision floating-point value
6760 | `a' by the corresponding value `b'.  The operation is performed according to
6761 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6762 *----------------------------------------------------------------------------*/
6763 
6764 float128 float128_div(float128 a, float128 b, float_status *status)
6765 {
6766     flag aSign, bSign, zSign;
6767     int32_t aExp, bExp, zExp;
6768     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6769     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6770 
6771     aSig1 = extractFloat128Frac1( a );
6772     aSig0 = extractFloat128Frac0( a );
6773     aExp = extractFloat128Exp( a );
6774     aSign = extractFloat128Sign( a );
6775     bSig1 = extractFloat128Frac1( b );
6776     bSig0 = extractFloat128Frac0( b );
6777     bExp = extractFloat128Exp( b );
6778     bSign = extractFloat128Sign( b );
6779     zSign = aSign ^ bSign;
6780     if ( aExp == 0x7FFF ) {
6781         if (aSig0 | aSig1) {
6782             return propagateFloat128NaN(a, b, status);
6783         }
6784         if ( bExp == 0x7FFF ) {
6785             if (bSig0 | bSig1) {
6786                 return propagateFloat128NaN(a, b, status);
6787             }
6788             goto invalid;
6789         }
6790         return packFloat128( zSign, 0x7FFF, 0, 0 );
6791     }
6792     if ( bExp == 0x7FFF ) {
6793         if (bSig0 | bSig1) {
6794             return propagateFloat128NaN(a, b, status);
6795         }
6796         return packFloat128( zSign, 0, 0, 0 );
6797     }
6798     if ( bExp == 0 ) {
6799         if ( ( bSig0 | bSig1 ) == 0 ) {
6800             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6801  invalid:
6802                 float_raise(float_flag_invalid, status);
6803                 return float128_default_nan(status);
6804             }
6805             float_raise(float_flag_divbyzero, status);
6806             return packFloat128( zSign, 0x7FFF, 0, 0 );
6807         }
6808         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6809     }
6810     if ( aExp == 0 ) {
6811         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6812         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6813     }
6814     zExp = aExp - bExp + 0x3FFD;
6815     shortShift128Left(
6816         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6817     shortShift128Left(
6818         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6819     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6820         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6821         ++zExp;
6822     }
6823     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6824     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6825     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
6826     while ( (int64_t) rem0 < 0 ) {
6827         --zSig0;
6828         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6829     }
6830     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6831     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6832         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6833         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
6834         while ( (int64_t) rem1 < 0 ) {
6835             --zSig1;
6836             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6837         }
6838         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6839     }
6840     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
6841     return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6842 
6843 }
6844 
6845 /*----------------------------------------------------------------------------
6846 | Returns the remainder of the quadruple-precision floating-point value `a'
6847 | with respect to the corresponding value `b'.  The operation is performed
6848 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6849 *----------------------------------------------------------------------------*/
6850 
6851 float128 float128_rem(float128 a, float128 b, float_status *status)
6852 {
6853     flag aSign, zSign;
6854     int32_t aExp, bExp, expDiff;
6855     uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6856     uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6857     int64_t sigMean0;
6858 
6859     aSig1 = extractFloat128Frac1( a );
6860     aSig0 = extractFloat128Frac0( a );
6861     aExp = extractFloat128Exp( a );
6862     aSign = extractFloat128Sign( a );
6863     bSig1 = extractFloat128Frac1( b );
6864     bSig0 = extractFloat128Frac0( b );
6865     bExp = extractFloat128Exp( b );
6866     if ( aExp == 0x7FFF ) {
6867         if (    ( aSig0 | aSig1 )
6868              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6869             return propagateFloat128NaN(a, b, status);
6870         }
6871         goto invalid;
6872     }
6873     if ( bExp == 0x7FFF ) {
6874         if (bSig0 | bSig1) {
6875             return propagateFloat128NaN(a, b, status);
6876         }
6877         return a;
6878     }
6879     if ( bExp == 0 ) {
6880         if ( ( bSig0 | bSig1 ) == 0 ) {
6881  invalid:
6882             float_raise(float_flag_invalid, status);
6883             return float128_default_nan(status);
6884         }
6885         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6886     }
6887     if ( aExp == 0 ) {
6888         if ( ( aSig0 | aSig1 ) == 0 ) return a;
6889         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6890     }
6891     expDiff = aExp - bExp;
6892     if ( expDiff < -1 ) return a;
6893     shortShift128Left(
6894         aSig0 | LIT64( 0x0001000000000000 ),
6895         aSig1,
6896         15 - ( expDiff < 0 ),
6897         &aSig0,
6898         &aSig1
6899     );
6900     shortShift128Left(
6901         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6902     q = le128( bSig0, bSig1, aSig0, aSig1 );
6903     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6904     expDiff -= 64;
6905     while ( 0 < expDiff ) {
6906         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6907         q = ( 4 < q ) ? q - 4 : 0;
6908         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6909         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6910         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6911         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6912         expDiff -= 61;
6913     }
6914     if ( -64 < expDiff ) {
6915         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6916         q = ( 4 < q ) ? q - 4 : 0;
6917         q >>= - expDiff;
6918         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6919         expDiff += 52;
6920         if ( expDiff < 0 ) {
6921             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6922         }
6923         else {
6924             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6925         }
6926         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6927         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6928     }
6929     else {
6930         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6931         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6932     }
6933     do {
6934         alternateASig0 = aSig0;
6935         alternateASig1 = aSig1;
6936         ++q;
6937         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6938     } while ( 0 <= (int64_t) aSig0 );
6939     add128(
6940         aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
6941     if (    ( sigMean0 < 0 )
6942          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6943         aSig0 = alternateASig0;
6944         aSig1 = alternateASig1;
6945     }
6946     zSign = ( (int64_t) aSig0 < 0 );
6947     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6948     return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
6949                                          status);
6950 }
6951 
6952 /*----------------------------------------------------------------------------
6953 | Returns the square root of the quadruple-precision floating-point value `a'.
6954 | The operation is performed according to the IEC/IEEE Standard for Binary
6955 | Floating-Point Arithmetic.
6956 *----------------------------------------------------------------------------*/
6957 
6958 float128 float128_sqrt(float128 a, float_status *status)
6959 {
6960     flag aSign;
6961     int32_t aExp, zExp;
6962     uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6963     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6964 
6965     aSig1 = extractFloat128Frac1( a );
6966     aSig0 = extractFloat128Frac0( a );
6967     aExp = extractFloat128Exp( a );
6968     aSign = extractFloat128Sign( a );
6969     if ( aExp == 0x7FFF ) {
6970         if (aSig0 | aSig1) {
6971             return propagateFloat128NaN(a, a, status);
6972         }
6973         if ( ! aSign ) return a;
6974         goto invalid;
6975     }
6976     if ( aSign ) {
6977         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6978  invalid:
6979         float_raise(float_flag_invalid, status);
6980         return float128_default_nan(status);
6981     }
6982     if ( aExp == 0 ) {
6983         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6984         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6985     }
6986     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6987     aSig0 |= LIT64( 0x0001000000000000 );
6988     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6989     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6990     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6991     doubleZSig0 = zSig0<<1;
6992     mul64To128( zSig0, zSig0, &term0, &term1 );
6993     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6994     while ( (int64_t) rem0 < 0 ) {
6995         --zSig0;
6996         doubleZSig0 -= 2;
6997         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6998     }
6999     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
7000     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
7001         if ( zSig1 == 0 ) zSig1 = 1;
7002         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
7003         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
7004         mul64To128( zSig1, zSig1, &term2, &term3 );
7005         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
7006         while ( (int64_t) rem1 < 0 ) {
7007             --zSig1;
7008             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
7009             term3 |= 1;
7010             term2 |= doubleZSig0;
7011             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
7012         }
7013         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
7014     }
7015     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
7016     return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
7017 
7018 }
7019 
7020 /*----------------------------------------------------------------------------
7021 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
7022 | the corresponding value `b', and 0 otherwise.  The invalid exception is
7023 | raised if either operand is a NaN.  Otherwise, the comparison is performed
7024 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7025 *----------------------------------------------------------------------------*/
7026 
7027 int float128_eq(float128 a, float128 b, float_status *status)
7028 {
7029 
7030     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7031               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7032          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7033               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7034        ) {
7035         float_raise(float_flag_invalid, status);
7036         return 0;
7037     }
7038     return
7039            ( a.low == b.low )
7040         && (    ( a.high == b.high )
7041              || (    ( a.low == 0 )
7042                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
7043            );
7044 
7045 }
7046 
7047 /*----------------------------------------------------------------------------
7048 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7049 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
7050 | exception is raised if either operand is a NaN.  The comparison is performed
7051 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7052 *----------------------------------------------------------------------------*/
7053 
7054 int float128_le(float128 a, float128 b, float_status *status)
7055 {
7056     flag aSign, bSign;
7057 
7058     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7059               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7060          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7061               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7062        ) {
7063         float_raise(float_flag_invalid, status);
7064         return 0;
7065     }
7066     aSign = extractFloat128Sign( a );
7067     bSign = extractFloat128Sign( b );
7068     if ( aSign != bSign ) {
7069         return
7070                aSign
7071             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7072                  == 0 );
7073     }
7074     return
7075           aSign ? le128( b.high, b.low, a.high, a.low )
7076         : le128( a.high, a.low, b.high, b.low );
7077 
7078 }
7079 
7080 /*----------------------------------------------------------------------------
7081 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7082 | the corresponding value `b', and 0 otherwise.  The invalid exception is
7083 | raised if either operand is a NaN.  The comparison is performed according
7084 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7085 *----------------------------------------------------------------------------*/
7086 
7087 int float128_lt(float128 a, float128 b, float_status *status)
7088 {
7089     flag aSign, bSign;
7090 
7091     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7092               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7093          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7094               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7095        ) {
7096         float_raise(float_flag_invalid, status);
7097         return 0;
7098     }
7099     aSign = extractFloat128Sign( a );
7100     bSign = extractFloat128Sign( b );
7101     if ( aSign != bSign ) {
7102         return
7103                aSign
7104             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7105                  != 0 );
7106     }
7107     return
7108           aSign ? lt128( b.high, b.low, a.high, a.low )
7109         : lt128( a.high, a.low, b.high, b.low );
7110 
7111 }
7112 
7113 /*----------------------------------------------------------------------------
7114 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7115 | be compared, and 0 otherwise.  The invalid exception is raised if either
7116 | operand is a NaN. The comparison is performed according to the IEC/IEEE
7117 | Standard for Binary Floating-Point Arithmetic.
7118 *----------------------------------------------------------------------------*/
7119 
7120 int float128_unordered(float128 a, float128 b, float_status *status)
7121 {
7122     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7123               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7124          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7125               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7126        ) {
7127         float_raise(float_flag_invalid, status);
7128         return 1;
7129     }
7130     return 0;
7131 }
7132 
7133 /*----------------------------------------------------------------------------
7134 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
7135 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
7136 | exception.  The comparison is performed according to the IEC/IEEE Standard
7137 | for Binary Floating-Point Arithmetic.
7138 *----------------------------------------------------------------------------*/
7139 
7140 int float128_eq_quiet(float128 a, float128 b, float_status *status)
7141 {
7142 
7143     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7144               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7145          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7146               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7147        ) {
7148         if (float128_is_signaling_nan(a, status)
7149          || float128_is_signaling_nan(b, status)) {
7150             float_raise(float_flag_invalid, status);
7151         }
7152         return 0;
7153     }
7154     return
7155            ( a.low == b.low )
7156         && (    ( a.high == b.high )
7157              || (    ( a.low == 0 )
7158                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
7159            );
7160 
7161 }
7162 
7163 /*----------------------------------------------------------------------------
7164 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7165 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
7166 | cause an exception.  Otherwise, the comparison is performed according to the
7167 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
7168 *----------------------------------------------------------------------------*/
7169 
7170 int float128_le_quiet(float128 a, float128 b, float_status *status)
7171 {
7172     flag aSign, bSign;
7173 
7174     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7175               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7176          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7177               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7178        ) {
7179         if (float128_is_signaling_nan(a, status)
7180          || float128_is_signaling_nan(b, status)) {
7181             float_raise(float_flag_invalid, status);
7182         }
7183         return 0;
7184     }
7185     aSign = extractFloat128Sign( a );
7186     bSign = extractFloat128Sign( b );
7187     if ( aSign != bSign ) {
7188         return
7189                aSign
7190             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7191                  == 0 );
7192     }
7193     return
7194           aSign ? le128( b.high, b.low, a.high, a.low )
7195         : le128( a.high, a.low, b.high, b.low );
7196 
7197 }
7198 
7199 /*----------------------------------------------------------------------------
7200 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7201 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
7202 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
7203 | Standard for Binary Floating-Point Arithmetic.
7204 *----------------------------------------------------------------------------*/
7205 
7206 int float128_lt_quiet(float128 a, float128 b, float_status *status)
7207 {
7208     flag aSign, bSign;
7209 
7210     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7211               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7212          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7213               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7214        ) {
7215         if (float128_is_signaling_nan(a, status)
7216          || float128_is_signaling_nan(b, status)) {
7217             float_raise(float_flag_invalid, status);
7218         }
7219         return 0;
7220     }
7221     aSign = extractFloat128Sign( a );
7222     bSign = extractFloat128Sign( b );
7223     if ( aSign != bSign ) {
7224         return
7225                aSign
7226             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7227                  != 0 );
7228     }
7229     return
7230           aSign ? lt128( b.high, b.low, a.high, a.low )
7231         : lt128( a.high, a.low, b.high, b.low );
7232 
7233 }
7234 
7235 /*----------------------------------------------------------------------------
7236 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7237 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
7238 | comparison is performed according to the IEC/IEEE Standard for Binary
7239 | Floating-Point Arithmetic.
7240 *----------------------------------------------------------------------------*/
7241 
7242 int float128_unordered_quiet(float128 a, float128 b, float_status *status)
7243 {
7244     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
7245               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7246          || (    ( extractFloat128Exp( b ) == 0x7FFF )
7247               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7248        ) {
7249         if (float128_is_signaling_nan(a, status)
7250          || float128_is_signaling_nan(b, status)) {
7251             float_raise(float_flag_invalid, status);
7252         }
7253         return 1;
7254     }
7255     return 0;
7256 }
7257 
7258 /* misc functions */
7259 float32 uint32_to_float32(uint32_t a, float_status *status)
7260 {
7261     return int64_to_float32(a, status);
7262 }
7263 
7264 float64 uint32_to_float64(uint32_t a, float_status *status)
7265 {
7266     return int64_to_float64(a, status);
7267 }
7268 
7269 uint32_t float32_to_uint32(float32 a, float_status *status)
7270 {
7271     int64_t v;
7272     uint32_t res;
7273     int old_exc_flags = get_float_exception_flags(status);
7274 
7275     v = float32_to_int64(a, status);
7276     if (v < 0) {
7277         res = 0;
7278     } else if (v > 0xffffffff) {
7279         res = 0xffffffff;
7280     } else {
7281         return v;
7282     }
7283     set_float_exception_flags(old_exc_flags, status);
7284     float_raise(float_flag_invalid, status);
7285     return res;
7286 }
7287 
7288 uint32_t float32_to_uint32_round_to_zero(float32 a, float_status *status)
7289 {
7290     int64_t v;
7291     uint32_t res;
7292     int old_exc_flags = get_float_exception_flags(status);
7293 
7294     v = float32_to_int64_round_to_zero(a, status);
7295     if (v < 0) {
7296         res = 0;
7297     } else if (v > 0xffffffff) {
7298         res = 0xffffffff;
7299     } else {
7300         return v;
7301     }
7302     set_float_exception_flags(old_exc_flags, status);
7303     float_raise(float_flag_invalid, status);
7304     return res;
7305 }
7306 
7307 int16_t float32_to_int16(float32 a, float_status *status)
7308 {
7309     int32_t v;
7310     int16_t res;
7311     int old_exc_flags = get_float_exception_flags(status);
7312 
7313     v = float32_to_int32(a, status);
7314     if (v < -0x8000) {
7315         res = -0x8000;
7316     } else if (v > 0x7fff) {
7317         res = 0x7fff;
7318     } else {
7319         return v;
7320     }
7321 
7322     set_float_exception_flags(old_exc_flags, status);
7323     float_raise(float_flag_invalid, status);
7324     return res;
7325 }
7326 
7327 uint16_t float32_to_uint16(float32 a, float_status *status)
7328 {
7329     int32_t v;
7330     uint16_t res;
7331     int old_exc_flags = get_float_exception_flags(status);
7332 
7333     v = float32_to_int32(a, status);
7334     if (v < 0) {
7335         res = 0;
7336     } else if (v > 0xffff) {
7337         res = 0xffff;
7338     } else {
7339         return v;
7340     }
7341 
7342     set_float_exception_flags(old_exc_flags, status);
7343     float_raise(float_flag_invalid, status);
7344     return res;
7345 }
7346 
7347 uint16_t float32_to_uint16_round_to_zero(float32 a, float_status *status)
7348 {
7349     int64_t v;
7350     uint16_t res;
7351     int old_exc_flags = get_float_exception_flags(status);
7352 
7353     v = float32_to_int64_round_to_zero(a, status);
7354     if (v < 0) {
7355         res = 0;
7356     } else if (v > 0xffff) {
7357         res = 0xffff;
7358     } else {
7359         return v;
7360     }
7361     set_float_exception_flags(old_exc_flags, status);
7362     float_raise(float_flag_invalid, status);
7363     return res;
7364 }
7365 
7366 uint32_t float64_to_uint32(float64 a, float_status *status)
7367 {
7368     uint64_t v;
7369     uint32_t res;
7370     int old_exc_flags = get_float_exception_flags(status);
7371 
7372     v = float64_to_uint64(a, status);
7373     if (v > 0xffffffff) {
7374         res = 0xffffffff;
7375     } else {
7376         return v;
7377     }
7378     set_float_exception_flags(old_exc_flags, status);
7379     float_raise(float_flag_invalid, status);
7380     return res;
7381 }
7382 
7383 uint32_t float64_to_uint32_round_to_zero(float64 a, float_status *status)
7384 {
7385     uint64_t v;
7386     uint32_t res;
7387     int old_exc_flags = get_float_exception_flags(status);
7388 
7389     v = float64_to_uint64_round_to_zero(a, status);
7390     if (v > 0xffffffff) {
7391         res = 0xffffffff;
7392     } else {
7393         return v;
7394     }
7395     set_float_exception_flags(old_exc_flags, status);
7396     float_raise(float_flag_invalid, status);
7397     return res;
7398 }
7399 
7400 int16_t float64_to_int16(float64 a, float_status *status)
7401 {
7402     int64_t v;
7403     int16_t res;
7404     int old_exc_flags = get_float_exception_flags(status);
7405 
7406     v = float64_to_int32(a, status);
7407     if (v < -0x8000) {
7408         res = -0x8000;
7409     } else if (v > 0x7fff) {
7410         res = 0x7fff;
7411     } else {
7412         return v;
7413     }
7414 
7415     set_float_exception_flags(old_exc_flags, status);
7416     float_raise(float_flag_invalid, status);
7417     return res;
7418 }
7419 
7420 uint16_t float64_to_uint16(float64 a, float_status *status)
7421 {
7422     int64_t v;
7423     uint16_t res;
7424     int old_exc_flags = get_float_exception_flags(status);
7425 
7426     v = float64_to_int32(a, status);
7427     if (v < 0) {
7428         res = 0;
7429     } else if (v > 0xffff) {
7430         res = 0xffff;
7431     } else {
7432         return v;
7433     }
7434 
7435     set_float_exception_flags(old_exc_flags, status);
7436     float_raise(float_flag_invalid, status);
7437     return res;
7438 }
7439 
7440 uint16_t float64_to_uint16_round_to_zero(float64 a, float_status *status)
7441 {
7442     int64_t v;
7443     uint16_t res;
7444     int old_exc_flags = get_float_exception_flags(status);
7445 
7446     v = float64_to_int64_round_to_zero(a, status);
7447     if (v < 0) {
7448         res = 0;
7449     } else if (v > 0xffff) {
7450         res = 0xffff;
7451     } else {
7452         return v;
7453     }
7454     set_float_exception_flags(old_exc_flags, status);
7455     float_raise(float_flag_invalid, status);
7456     return res;
7457 }
7458 
7459 /*----------------------------------------------------------------------------
7460 | Returns the result of converting the double-precision floating-point value
7461 | `a' to the 64-bit unsigned integer format.  The conversion is
7462 | performed according to the IEC/IEEE Standard for Binary Floating-Point
7463 | Arithmetic---which means in particular that the conversion is rounded
7464 | according to the current rounding mode.  If `a' is a NaN, the largest
7465 | positive integer is returned.  If the conversion overflows, the
7466 | largest unsigned integer is returned.  If 'a' is negative, the value is
7467 | rounded and zero is returned; negative values that do not round to zero
7468 | will raise the inexact exception.
7469 *----------------------------------------------------------------------------*/
7470 
7471 uint64_t float64_to_uint64(float64 a, float_status *status)
7472 {
7473     flag aSign;
7474     int aExp;
7475     int shiftCount;
7476     uint64_t aSig, aSigExtra;
7477     a = float64_squash_input_denormal(a, status);
7478 
7479     aSig = extractFloat64Frac(a);
7480     aExp = extractFloat64Exp(a);
7481     aSign = extractFloat64Sign(a);
7482     if (aSign && (aExp > 1022)) {
7483         float_raise(float_flag_invalid, status);
7484         if (float64_is_any_nan(a)) {
7485             return LIT64(0xFFFFFFFFFFFFFFFF);
7486         } else {
7487             return 0;
7488         }
7489     }
7490     if (aExp) {
7491         aSig |= LIT64(0x0010000000000000);
7492     }
7493     shiftCount = 0x433 - aExp;
7494     if (shiftCount <= 0) {
7495         if (0x43E < aExp) {
7496             float_raise(float_flag_invalid, status);
7497             return LIT64(0xFFFFFFFFFFFFFFFF);
7498         }
7499         aSigExtra = 0;
7500         aSig <<= -shiftCount;
7501     } else {
7502         shift64ExtraRightJamming(aSig, 0, shiftCount, &aSig, &aSigExtra);
7503     }
7504     return roundAndPackUint64(aSign, aSig, aSigExtra, status);
7505 }
7506 
7507 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *status)
7508 {
7509     signed char current_rounding_mode = status->float_rounding_mode;
7510     set_float_rounding_mode(float_round_to_zero, status);
7511     uint64_t v = float64_to_uint64(a, status);
7512     set_float_rounding_mode(current_rounding_mode, status);
7513     return v;
7514 }
7515 
7516 #define COMPARE(s, nan_exp)                                                  \
7517 static inline int float ## s ## _compare_internal(float ## s a, float ## s b,\
7518                                       int is_quiet, float_status *status)    \
7519 {                                                                            \
7520     flag aSign, bSign;                                                       \
7521     uint ## s ## _t av, bv;                                                  \
7522     a = float ## s ## _squash_input_denormal(a, status);                     \
7523     b = float ## s ## _squash_input_denormal(b, status);                     \
7524                                                                              \
7525     if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) &&                    \
7526          extractFloat ## s ## Frac( a ) ) ||                                 \
7527         ( ( extractFloat ## s ## Exp( b ) == nan_exp ) &&                    \
7528           extractFloat ## s ## Frac( b ) )) {                                \
7529         if (!is_quiet ||                                                     \
7530             float ## s ## _is_signaling_nan(a, status) ||                  \
7531             float ## s ## _is_signaling_nan(b, status)) {                 \
7532             float_raise(float_flag_invalid, status);                         \
7533         }                                                                    \
7534         return float_relation_unordered;                                     \
7535     }                                                                        \
7536     aSign = extractFloat ## s ## Sign( a );                                  \
7537     bSign = extractFloat ## s ## Sign( b );                                  \
7538     av = float ## s ## _val(a);                                              \
7539     bv = float ## s ## _val(b);                                              \
7540     if ( aSign != bSign ) {                                                  \
7541         if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) {                   \
7542             /* zero case */                                                  \
7543             return float_relation_equal;                                     \
7544         } else {                                                             \
7545             return 1 - (2 * aSign);                                          \
7546         }                                                                    \
7547     } else {                                                                 \
7548         if (av == bv) {                                                      \
7549             return float_relation_equal;                                     \
7550         } else {                                                             \
7551             return 1 - 2 * (aSign ^ ( av < bv ));                            \
7552         }                                                                    \
7553     }                                                                        \
7554 }                                                                            \
7555                                                                              \
7556 int float ## s ## _compare(float ## s a, float ## s b, float_status *status) \
7557 {                                                                            \
7558     return float ## s ## _compare_internal(a, b, 0, status);                 \
7559 }                                                                            \
7560                                                                              \
7561 int float ## s ## _compare_quiet(float ## s a, float ## s b,                 \
7562                                  float_status *status)                       \
7563 {                                                                            \
7564     return float ## s ## _compare_internal(a, b, 1, status);                 \
7565 }
7566 
7567 COMPARE(32, 0xff)
7568 COMPARE(64, 0x7ff)
7569 
7570 static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
7571                                             int is_quiet, float_status *status)
7572 {
7573     flag aSign, bSign;
7574 
7575     if (floatx80_invalid_encoding(a) || floatx80_invalid_encoding(b)) {
7576         float_raise(float_flag_invalid, status);
7577         return float_relation_unordered;
7578     }
7579     if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
7580           ( extractFloatx80Frac( a )<<1 ) ) ||
7581         ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
7582           ( extractFloatx80Frac( b )<<1 ) )) {
7583         if (!is_quiet ||
7584             floatx80_is_signaling_nan(a, status) ||
7585             floatx80_is_signaling_nan(b, status)) {
7586             float_raise(float_flag_invalid, status);
7587         }
7588         return float_relation_unordered;
7589     }
7590     aSign = extractFloatx80Sign( a );
7591     bSign = extractFloatx80Sign( b );
7592     if ( aSign != bSign ) {
7593 
7594         if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
7595              ( ( a.low | b.low ) == 0 ) ) {
7596             /* zero case */
7597             return float_relation_equal;
7598         } else {
7599             return 1 - (2 * aSign);
7600         }
7601     } else {
7602         if (a.low == b.low && a.high == b.high) {
7603             return float_relation_equal;
7604         } else {
7605             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7606         }
7607     }
7608 }
7609 
7610 int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
7611 {
7612     return floatx80_compare_internal(a, b, 0, status);
7613 }
7614 
7615 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
7616 {
7617     return floatx80_compare_internal(a, b, 1, status);
7618 }
7619 
7620 static inline int float128_compare_internal(float128 a, float128 b,
7621                                             int is_quiet, float_status *status)
7622 {
7623     flag aSign, bSign;
7624 
7625     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7626           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7627         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7628           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7629         if (!is_quiet ||
7630             float128_is_signaling_nan(a, status) ||
7631             float128_is_signaling_nan(b, status)) {
7632             float_raise(float_flag_invalid, status);
7633         }
7634         return float_relation_unordered;
7635     }
7636     aSign = extractFloat128Sign( a );
7637     bSign = extractFloat128Sign( b );
7638     if ( aSign != bSign ) {
7639         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7640             /* zero case */
7641             return float_relation_equal;
7642         } else {
7643             return 1 - (2 * aSign);
7644         }
7645     } else {
7646         if (a.low == b.low && a.high == b.high) {
7647             return float_relation_equal;
7648         } else {
7649             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7650         }
7651     }
7652 }
7653 
7654 int float128_compare(float128 a, float128 b, float_status *status)
7655 {
7656     return float128_compare_internal(a, b, 0, status);
7657 }
7658 
7659 int float128_compare_quiet(float128 a, float128 b, float_status *status)
7660 {
7661     return float128_compare_internal(a, b, 1, status);
7662 }
7663 
7664 /* min() and max() functions. These can't be implemented as
7665  * 'compare and pick one input' because that would mishandle
7666  * NaNs and +0 vs -0.
7667  *
7668  * minnum() and maxnum() functions. These are similar to the min()
7669  * and max() functions but if one of the arguments is a QNaN and
7670  * the other is numerical then the numerical argument is returned.
7671  * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
7672  * and maxNum() operations. min() and max() are the typical min/max
7673  * semantics provided by many CPUs which predate that specification.
7674  *
7675  * minnummag() and maxnummag() functions correspond to minNumMag()
7676  * and minNumMag() from the IEEE-754 2008.
7677  */
7678 #define MINMAX(s)                                                       \
7679 static inline float ## s float ## s ## _minmax(float ## s a, float ## s b,     \
7680                                                int ismin, int isieee,   \
7681                                                int ismag,               \
7682                                                float_status *status)    \
7683 {                                                                       \
7684     flag aSign, bSign;                                                  \
7685     uint ## s ## _t av, bv, aav, abv;                                   \
7686     a = float ## s ## _squash_input_denormal(a, status);                \
7687     b = float ## s ## _squash_input_denormal(b, status);                \
7688     if (float ## s ## _is_any_nan(a) ||                                 \
7689         float ## s ## _is_any_nan(b)) {                                 \
7690         if (isieee) {                                                   \
7691             if (float ## s ## _is_quiet_nan(a, status) &&               \
7692                 !float ## s ##_is_any_nan(b)) {                         \
7693                 return b;                                               \
7694             } else if (float ## s ## _is_quiet_nan(b, status) &&        \
7695                        !float ## s ## _is_any_nan(a)) {                \
7696                 return a;                                               \
7697             }                                                           \
7698         }                                                               \
7699         return propagateFloat ## s ## NaN(a, b, status);                \
7700     }                                                                   \
7701     aSign = extractFloat ## s ## Sign(a);                               \
7702     bSign = extractFloat ## s ## Sign(b);                               \
7703     av = float ## s ## _val(a);                                         \
7704     bv = float ## s ## _val(b);                                         \
7705     if (ismag) {                                                        \
7706         aav = float ## s ## _abs(av);                                   \
7707         abv = float ## s ## _abs(bv);                                   \
7708         if (aav != abv) {                                               \
7709             if (ismin) {                                                \
7710                 return (aav < abv) ? a : b;                             \
7711             } else {                                                    \
7712                 return (aav < abv) ? b : a;                             \
7713             }                                                           \
7714         }                                                               \
7715     }                                                                   \
7716     if (aSign != bSign) {                                               \
7717         if (ismin) {                                                    \
7718             return aSign ? a : b;                                       \
7719         } else {                                                        \
7720             return aSign ? b : a;                                       \
7721         }                                                               \
7722     } else {                                                            \
7723         if (ismin) {                                                    \
7724             return (aSign ^ (av < bv)) ? a : b;                         \
7725         } else {                                                        \
7726             return (aSign ^ (av < bv)) ? b : a;                         \
7727         }                                                               \
7728     }                                                                   \
7729 }                                                                       \
7730                                                                         \
7731 float ## s float ## s ## _min(float ## s a, float ## s b,               \
7732                               float_status *status)                     \
7733 {                                                                       \
7734     return float ## s ## _minmax(a, b, 1, 0, 0, status);                \
7735 }                                                                       \
7736                                                                         \
7737 float ## s float ## s ## _max(float ## s a, float ## s b,               \
7738                               float_status *status)                     \
7739 {                                                                       \
7740     return float ## s ## _minmax(a, b, 0, 0, 0, status);                \
7741 }                                                                       \
7742                                                                         \
7743 float ## s float ## s ## _minnum(float ## s a, float ## s b,            \
7744                                  float_status *status)                  \
7745 {                                                                       \
7746     return float ## s ## _minmax(a, b, 1, 1, 0, status);                \
7747 }                                                                       \
7748                                                                         \
7749 float ## s float ## s ## _maxnum(float ## s a, float ## s b,            \
7750                                  float_status *status)                  \
7751 {                                                                       \
7752     return float ## s ## _minmax(a, b, 0, 1, 0, status);                \
7753 }                                                                       \
7754                                                                         \
7755 float ## s float ## s ## _minnummag(float ## s a, float ## s b,         \
7756                                     float_status *status)               \
7757 {                                                                       \
7758     return float ## s ## _minmax(a, b, 1, 1, 1, status);                \
7759 }                                                                       \
7760                                                                         \
7761 float ## s float ## s ## _maxnummag(float ## s a, float ## s b,         \
7762                                     float_status *status)               \
7763 {                                                                       \
7764     return float ## s ## _minmax(a, b, 0, 1, 1, status);                \
7765 }
7766 
7767 MINMAX(32)
7768 MINMAX(64)
7769 
7770 
7771 /* Multiply A by 2 raised to the power N.  */
7772 float32 float32_scalbn(float32 a, int n, float_status *status)
7773 {
7774     flag aSign;
7775     int16_t aExp;
7776     uint32_t aSig;
7777 
7778     a = float32_squash_input_denormal(a, status);
7779     aSig = extractFloat32Frac( a );
7780     aExp = extractFloat32Exp( a );
7781     aSign = extractFloat32Sign( a );
7782 
7783     if ( aExp == 0xFF ) {
7784         if ( aSig ) {
7785             return propagateFloat32NaN(a, a, status);
7786         }
7787         return a;
7788     }
7789     if (aExp != 0) {
7790         aSig |= 0x00800000;
7791     } else if (aSig == 0) {
7792         return a;
7793     } else {
7794         aExp++;
7795     }
7796 
7797     if (n > 0x200) {
7798         n = 0x200;
7799     } else if (n < -0x200) {
7800         n = -0x200;
7801     }
7802 
7803     aExp += n - 1;
7804     aSig <<= 7;
7805     return normalizeRoundAndPackFloat32(aSign, aExp, aSig, status);
7806 }
7807 
7808 float64 float64_scalbn(float64 a, int n, float_status *status)
7809 {
7810     flag aSign;
7811     int16_t aExp;
7812     uint64_t aSig;
7813 
7814     a = float64_squash_input_denormal(a, status);
7815     aSig = extractFloat64Frac( a );
7816     aExp = extractFloat64Exp( a );
7817     aSign = extractFloat64Sign( a );
7818 
7819     if ( aExp == 0x7FF ) {
7820         if ( aSig ) {
7821             return propagateFloat64NaN(a, a, status);
7822         }
7823         return a;
7824     }
7825     if (aExp != 0) {
7826         aSig |= LIT64( 0x0010000000000000 );
7827     } else if (aSig == 0) {
7828         return a;
7829     } else {
7830         aExp++;
7831     }
7832 
7833     if (n > 0x1000) {
7834         n = 0x1000;
7835     } else if (n < -0x1000) {
7836         n = -0x1000;
7837     }
7838 
7839     aExp += n - 1;
7840     aSig <<= 10;
7841     return normalizeRoundAndPackFloat64(aSign, aExp, aSig, status);
7842 }
7843 
7844 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
7845 {
7846     flag aSign;
7847     int32_t aExp;
7848     uint64_t aSig;
7849 
7850     if (floatx80_invalid_encoding(a)) {
7851         float_raise(float_flag_invalid, status);
7852         return floatx80_default_nan(status);
7853     }
7854     aSig = extractFloatx80Frac( a );
7855     aExp = extractFloatx80Exp( a );
7856     aSign = extractFloatx80Sign( a );
7857 
7858     if ( aExp == 0x7FFF ) {
7859         if ( aSig<<1 ) {
7860             return propagateFloatx80NaN(a, a, status);
7861         }
7862         return a;
7863     }
7864 
7865     if (aExp == 0) {
7866         if (aSig == 0) {
7867             return a;
7868         }
7869         aExp++;
7870     }
7871 
7872     if (n > 0x10000) {
7873         n = 0x10000;
7874     } else if (n < -0x10000) {
7875         n = -0x10000;
7876     }
7877 
7878     aExp += n;
7879     return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7880                                          aSign, aExp, aSig, 0, status);
7881 }
7882 
7883 float128 float128_scalbn(float128 a, int n, float_status *status)
7884 {
7885     flag aSign;
7886     int32_t aExp;
7887     uint64_t aSig0, aSig1;
7888 
7889     aSig1 = extractFloat128Frac1( a );
7890     aSig0 = extractFloat128Frac0( a );
7891     aExp = extractFloat128Exp( a );
7892     aSign = extractFloat128Sign( a );
7893     if ( aExp == 0x7FFF ) {
7894         if ( aSig0 | aSig1 ) {
7895             return propagateFloat128NaN(a, a, status);
7896         }
7897         return a;
7898     }
7899     if (aExp != 0) {
7900         aSig0 |= LIT64( 0x0001000000000000 );
7901     } else if (aSig0 == 0 && aSig1 == 0) {
7902         return a;
7903     } else {
7904         aExp++;
7905     }
7906 
7907     if (n > 0x10000) {
7908         n = 0x10000;
7909     } else if (n < -0x10000) {
7910         n = -0x10000;
7911     }
7912 
7913     aExp += n - 1;
7914     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7915                                          , status);
7916 
7917 }
7918