xref: /openbmc/qemu/fpu/softfloat.c (revision 8f1e884b)
1 /*
2  * QEMU float support
3  *
4  * Derived from SoftFloat.
5  */
6 
7 /*============================================================================
8 
9 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
10 Package, Release 2b.
11 
12 Written by John R. Hauser.  This work was made possible in part by the
13 International Computer Science Institute, located at Suite 600, 1947 Center
14 Street, Berkeley, California 94704.  Funding was partially provided by the
15 National Science Foundation under grant MIP-9311980.  The original version
16 of this code was written as part of a project to build a fixed-point vector
17 processor in collaboration with the University of California at Berkeley,
18 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
19 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
20 arithmetic/SoftFloat.html'.
21 
22 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
23 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
24 RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
25 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
26 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
27 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
28 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
29 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
30 
31 Derivative works are acceptable, even for commercial purposes, so long as
32 (1) the source code for the derivative work includes prominent notice that
33 the work is derivative, and (2) the source code includes prominent notice with
34 these four paragraphs for those parts of this code that are retained.
35 
36 =============================================================================*/
37 
38 /* softfloat (and in particular the code in softfloat-specialize.h) is
39  * target-dependent and needs the TARGET_* macros.
40  */
41 #include "config.h"
42 
43 #include "fpu/softfloat.h"
44 
45 /* We only need stdlib for abort() */
46 #include <stdlib.h>
47 
48 /*----------------------------------------------------------------------------
49 | Primitive arithmetic functions, including multi-word arithmetic, and
50 | division and square root approximations.  (Can be specialized to target if
51 | desired.)
52 *----------------------------------------------------------------------------*/
53 #include "softfloat-macros.h"
54 
55 /*----------------------------------------------------------------------------
56 | Functions and definitions to determine:  (1) whether tininess for underflow
57 | is detected before or after rounding by default, (2) what (if anything)
58 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
59 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
60 | are propagated from function inputs to output.  These details are target-
61 | specific.
62 *----------------------------------------------------------------------------*/
63 #include "softfloat-specialize.h"
64 
65 /*----------------------------------------------------------------------------
66 | Returns the fraction bits of the half-precision floating-point value `a'.
67 *----------------------------------------------------------------------------*/
68 
69 INLINE uint32_t extractFloat16Frac(float16 a)
70 {
71     return float16_val(a) & 0x3ff;
72 }
73 
74 /*----------------------------------------------------------------------------
75 | Returns the exponent bits of the half-precision floating-point value `a'.
76 *----------------------------------------------------------------------------*/
77 
78 INLINE int_fast16_t extractFloat16Exp(float16 a)
79 {
80     return (float16_val(a) >> 10) & 0x1f;
81 }
82 
83 /*----------------------------------------------------------------------------
84 | Returns the sign bit of the single-precision floating-point value `a'.
85 *----------------------------------------------------------------------------*/
86 
87 INLINE flag extractFloat16Sign(float16 a)
88 {
89     return float16_val(a)>>15;
90 }
91 
92 /*----------------------------------------------------------------------------
93 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
94 | and 7, and returns the properly rounded 32-bit integer corresponding to the
95 | input.  If `zSign' is 1, the input is negated before being converted to an
96 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
97 | is simply rounded to an integer, with the inexact exception raised if the
98 | input cannot be represented exactly as an integer.  However, if the fixed-
99 | point input is too large, the invalid exception is raised and the largest
100 | positive or negative integer is returned.
101 *----------------------------------------------------------------------------*/
102 
103 static int32 roundAndPackInt32( flag zSign, uint64_t absZ STATUS_PARAM)
104 {
105     int8 roundingMode;
106     flag roundNearestEven;
107     int8 roundIncrement, roundBits;
108     int32_t z;
109 
110     roundingMode = STATUS(float_rounding_mode);
111     roundNearestEven = ( roundingMode == float_round_nearest_even );
112     switch (roundingMode) {
113     case float_round_nearest_even:
114     case float_round_ties_away:
115         roundIncrement = 0x40;
116         break;
117     case float_round_to_zero:
118         roundIncrement = 0;
119         break;
120     case float_round_up:
121         roundIncrement = zSign ? 0 : 0x7f;
122         break;
123     case float_round_down:
124         roundIncrement = zSign ? 0x7f : 0;
125         break;
126     default:
127         abort();
128     }
129     roundBits = absZ & 0x7F;
130     absZ = ( absZ + roundIncrement )>>7;
131     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
132     z = absZ;
133     if ( zSign ) z = - z;
134     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
135         float_raise( float_flag_invalid STATUS_VAR);
136         return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
137     }
138     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
139     return z;
140 
141 }
142 
143 /*----------------------------------------------------------------------------
144 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
145 | `absZ1', with binary point between bits 63 and 64 (between the input words),
146 | and returns the properly rounded 64-bit integer corresponding to the input.
147 | If `zSign' is 1, the input is negated before being converted to an integer.
148 | Ordinarily, the fixed-point input is simply rounded to an integer, with
149 | the inexact exception raised if the input cannot be represented exactly as
150 | an integer.  However, if the fixed-point input is too large, the invalid
151 | exception is raised and the largest positive or negative integer is
152 | returned.
153 *----------------------------------------------------------------------------*/
154 
155 static int64 roundAndPackInt64( flag zSign, uint64_t absZ0, uint64_t absZ1 STATUS_PARAM)
156 {
157     int8 roundingMode;
158     flag roundNearestEven, increment;
159     int64_t z;
160 
161     roundingMode = STATUS(float_rounding_mode);
162     roundNearestEven = ( roundingMode == float_round_nearest_even );
163     switch (roundingMode) {
164     case float_round_nearest_even:
165     case float_round_ties_away:
166         increment = ((int64_t) absZ1 < 0);
167         break;
168     case float_round_to_zero:
169         increment = 0;
170         break;
171     case float_round_up:
172         increment = !zSign && absZ1;
173         break;
174     case float_round_down:
175         increment = zSign && absZ1;
176         break;
177     default:
178         abort();
179     }
180     if ( increment ) {
181         ++absZ0;
182         if ( absZ0 == 0 ) goto overflow;
183         absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
184     }
185     z = absZ0;
186     if ( zSign ) z = - z;
187     if ( z && ( ( z < 0 ) ^ zSign ) ) {
188  overflow:
189         float_raise( float_flag_invalid STATUS_VAR);
190         return
191               zSign ? (int64_t) LIT64( 0x8000000000000000 )
192             : LIT64( 0x7FFFFFFFFFFFFFFF );
193     }
194     if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
195     return z;
196 
197 }
198 
199 /*----------------------------------------------------------------------------
200 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
201 | `absZ1', with binary point between bits 63 and 64 (between the input words),
202 | and returns the properly rounded 64-bit unsigned integer corresponding to the
203 | input.  Ordinarily, the fixed-point input is simply rounded to an integer,
204 | with the inexact exception raised if the input cannot be represented exactly
205 | as an integer.  However, if the fixed-point input is too large, the invalid
206 | exception is raised and the largest unsigned integer is returned.
207 *----------------------------------------------------------------------------*/
208 
209 static int64 roundAndPackUint64(flag zSign, uint64_t absZ0,
210                                 uint64_t absZ1 STATUS_PARAM)
211 {
212     int8 roundingMode;
213     flag roundNearestEven, increment;
214 
215     roundingMode = STATUS(float_rounding_mode);
216     roundNearestEven = (roundingMode == float_round_nearest_even);
217     switch (roundingMode) {
218     case float_round_nearest_even:
219     case float_round_ties_away:
220         increment = ((int64_t)absZ1 < 0);
221         break;
222     case float_round_to_zero:
223         increment = 0;
224         break;
225     case float_round_up:
226         increment = !zSign && absZ1;
227         break;
228     case float_round_down:
229         increment = zSign && absZ1;
230         break;
231     default:
232         abort();
233     }
234     if (increment) {
235         ++absZ0;
236         if (absZ0 == 0) {
237             float_raise(float_flag_invalid STATUS_VAR);
238             return LIT64(0xFFFFFFFFFFFFFFFF);
239         }
240         absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
241     }
242 
243     if (zSign && absZ0) {
244         float_raise(float_flag_invalid STATUS_VAR);
245         return 0;
246     }
247 
248     if (absZ1) {
249         STATUS(float_exception_flags) |= float_flag_inexact;
250     }
251     return absZ0;
252 }
253 
254 /*----------------------------------------------------------------------------
255 | Returns the fraction bits of the single-precision floating-point value `a'.
256 *----------------------------------------------------------------------------*/
257 
258 INLINE uint32_t extractFloat32Frac( float32 a )
259 {
260 
261     return float32_val(a) & 0x007FFFFF;
262 
263 }
264 
265 /*----------------------------------------------------------------------------
266 | Returns the exponent bits of the single-precision floating-point value `a'.
267 *----------------------------------------------------------------------------*/
268 
269 INLINE int_fast16_t extractFloat32Exp(float32 a)
270 {
271 
272     return ( float32_val(a)>>23 ) & 0xFF;
273 
274 }
275 
276 /*----------------------------------------------------------------------------
277 | Returns the sign bit of the single-precision floating-point value `a'.
278 *----------------------------------------------------------------------------*/
279 
280 INLINE flag extractFloat32Sign( float32 a )
281 {
282 
283     return float32_val(a)>>31;
284 
285 }
286 
287 /*----------------------------------------------------------------------------
288 | If `a' is denormal and we are in flush-to-zero mode then set the
289 | input-denormal exception and return zero. Otherwise just return the value.
290 *----------------------------------------------------------------------------*/
291 float32 float32_squash_input_denormal(float32 a STATUS_PARAM)
292 {
293     if (STATUS(flush_inputs_to_zero)) {
294         if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
295             float_raise(float_flag_input_denormal STATUS_VAR);
296             return make_float32(float32_val(a) & 0x80000000);
297         }
298     }
299     return a;
300 }
301 
302 /*----------------------------------------------------------------------------
303 | Normalizes the subnormal single-precision floating-point value represented
304 | by the denormalized significand `aSig'.  The normalized exponent and
305 | significand are stored at the locations pointed to by `zExpPtr' and
306 | `zSigPtr', respectively.
307 *----------------------------------------------------------------------------*/
308 
309 static void
310  normalizeFloat32Subnormal(uint32_t aSig, int_fast16_t *zExpPtr, uint32_t *zSigPtr)
311 {
312     int8 shiftCount;
313 
314     shiftCount = countLeadingZeros32( aSig ) - 8;
315     *zSigPtr = aSig<<shiftCount;
316     *zExpPtr = 1 - shiftCount;
317 
318 }
319 
320 /*----------------------------------------------------------------------------
321 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
322 | single-precision floating-point value, returning the result.  After being
323 | shifted into the proper positions, the three fields are simply added
324 | together to form the result.  This means that any integer portion of `zSig'
325 | will be added into the exponent.  Since a properly normalized significand
326 | will have an integer portion equal to 1, the `zExp' input should be 1 less
327 | than the desired result exponent whenever `zSig' is a complete, normalized
328 | significand.
329 *----------------------------------------------------------------------------*/
330 
331 INLINE float32 packFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig)
332 {
333 
334     return make_float32(
335           ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
336 
337 }
338 
339 /*----------------------------------------------------------------------------
340 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
341 | and significand `zSig', and returns the proper single-precision floating-
342 | point value corresponding to the abstract input.  Ordinarily, the abstract
343 | value is simply rounded and packed into the single-precision format, with
344 | the inexact exception raised if the abstract input cannot be represented
345 | exactly.  However, if the abstract value is too large, the overflow and
346 | inexact exceptions are raised and an infinity or maximal finite value is
347 | returned.  If the abstract value is too small, the input value is rounded to
348 | a subnormal number, and the underflow and inexact exceptions are raised if
349 | the abstract input cannot be represented exactly as a subnormal single-
350 | precision floating-point number.
351 |     The input significand `zSig' has its binary point between bits 30
352 | and 29, which is 7 bits to the left of the usual location.  This shifted
353 | significand must be normalized or smaller.  If `zSig' is not normalized,
354 | `zExp' must be 0; in that case, the result returned is a subnormal number,
355 | and it must not require rounding.  In the usual case that `zSig' is
356 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
357 | The handling of underflow and overflow follows the IEC/IEEE Standard for
358 | Binary Floating-Point Arithmetic.
359 *----------------------------------------------------------------------------*/
360 
361 static float32 roundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig STATUS_PARAM)
362 {
363     int8 roundingMode;
364     flag roundNearestEven;
365     int8 roundIncrement, roundBits;
366     flag isTiny;
367 
368     roundingMode = STATUS(float_rounding_mode);
369     roundNearestEven = ( roundingMode == float_round_nearest_even );
370     switch (roundingMode) {
371     case float_round_nearest_even:
372     case float_round_ties_away:
373         roundIncrement = 0x40;
374         break;
375     case float_round_to_zero:
376         roundIncrement = 0;
377         break;
378     case float_round_up:
379         roundIncrement = zSign ? 0 : 0x7f;
380         break;
381     case float_round_down:
382         roundIncrement = zSign ? 0x7f : 0;
383         break;
384     default:
385         abort();
386         break;
387     }
388     roundBits = zSig & 0x7F;
389     if ( 0xFD <= (uint16_t) zExp ) {
390         if (    ( 0xFD < zExp )
391              || (    ( zExp == 0xFD )
392                   && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
393            ) {
394             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
395             return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
396         }
397         if ( zExp < 0 ) {
398             if (STATUS(flush_to_zero)) {
399                 float_raise(float_flag_output_denormal STATUS_VAR);
400                 return packFloat32(zSign, 0, 0);
401             }
402             isTiny =
403                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
404                 || ( zExp < -1 )
405                 || ( zSig + roundIncrement < 0x80000000 );
406             shift32RightJamming( zSig, - zExp, &zSig );
407             zExp = 0;
408             roundBits = zSig & 0x7F;
409             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
410         }
411     }
412     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
413     zSig = ( zSig + roundIncrement )>>7;
414     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
415     if ( zSig == 0 ) zExp = 0;
416     return packFloat32( zSign, zExp, zSig );
417 
418 }
419 
420 /*----------------------------------------------------------------------------
421 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
422 | and significand `zSig', and returns the proper single-precision floating-
423 | point value corresponding to the abstract input.  This routine is just like
424 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
425 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
426 | floating-point exponent.
427 *----------------------------------------------------------------------------*/
428 
429 static float32
430  normalizeRoundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig STATUS_PARAM)
431 {
432     int8 shiftCount;
433 
434     shiftCount = countLeadingZeros32( zSig ) - 1;
435     return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
436 
437 }
438 
439 /*----------------------------------------------------------------------------
440 | Returns the fraction bits of the double-precision floating-point value `a'.
441 *----------------------------------------------------------------------------*/
442 
443 INLINE uint64_t extractFloat64Frac( float64 a )
444 {
445 
446     return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
447 
448 }
449 
450 /*----------------------------------------------------------------------------
451 | Returns the exponent bits of the double-precision floating-point value `a'.
452 *----------------------------------------------------------------------------*/
453 
454 INLINE int_fast16_t extractFloat64Exp(float64 a)
455 {
456 
457     return ( float64_val(a)>>52 ) & 0x7FF;
458 
459 }
460 
461 /*----------------------------------------------------------------------------
462 | Returns the sign bit of the double-precision floating-point value `a'.
463 *----------------------------------------------------------------------------*/
464 
465 INLINE flag extractFloat64Sign( float64 a )
466 {
467 
468     return float64_val(a)>>63;
469 
470 }
471 
472 /*----------------------------------------------------------------------------
473 | If `a' is denormal and we are in flush-to-zero mode then set the
474 | input-denormal exception and return zero. Otherwise just return the value.
475 *----------------------------------------------------------------------------*/
476 float64 float64_squash_input_denormal(float64 a STATUS_PARAM)
477 {
478     if (STATUS(flush_inputs_to_zero)) {
479         if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
480             float_raise(float_flag_input_denormal STATUS_VAR);
481             return make_float64(float64_val(a) & (1ULL << 63));
482         }
483     }
484     return a;
485 }
486 
487 /*----------------------------------------------------------------------------
488 | Normalizes the subnormal double-precision floating-point value represented
489 | by the denormalized significand `aSig'.  The normalized exponent and
490 | significand are stored at the locations pointed to by `zExpPtr' and
491 | `zSigPtr', respectively.
492 *----------------------------------------------------------------------------*/
493 
494 static void
495  normalizeFloat64Subnormal(uint64_t aSig, int_fast16_t *zExpPtr, uint64_t *zSigPtr)
496 {
497     int8 shiftCount;
498 
499     shiftCount = countLeadingZeros64( aSig ) - 11;
500     *zSigPtr = aSig<<shiftCount;
501     *zExpPtr = 1 - shiftCount;
502 
503 }
504 
505 /*----------------------------------------------------------------------------
506 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
507 | double-precision floating-point value, returning the result.  After being
508 | shifted into the proper positions, the three fields are simply added
509 | together to form the result.  This means that any integer portion of `zSig'
510 | will be added into the exponent.  Since a properly normalized significand
511 | will have an integer portion equal to 1, the `zExp' input should be 1 less
512 | than the desired result exponent whenever `zSig' is a complete, normalized
513 | significand.
514 *----------------------------------------------------------------------------*/
515 
516 INLINE float64 packFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig)
517 {
518 
519     return make_float64(
520         ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
521 
522 }
523 
524 /*----------------------------------------------------------------------------
525 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
526 | and significand `zSig', and returns the proper double-precision floating-
527 | point value corresponding to the abstract input.  Ordinarily, the abstract
528 | value is simply rounded and packed into the double-precision format, with
529 | the inexact exception raised if the abstract input cannot be represented
530 | exactly.  However, if the abstract value is too large, the overflow and
531 | inexact exceptions are raised and an infinity or maximal finite value is
532 | returned.  If the abstract value is too small, the input value is rounded
533 | to a subnormal number, and the underflow and inexact exceptions are raised
534 | if the abstract input cannot be represented exactly as a subnormal double-
535 | precision floating-point number.
536 |     The input significand `zSig' has its binary point between bits 62
537 | and 61, which is 10 bits to the left of the usual location.  This shifted
538 | significand must be normalized or smaller.  If `zSig' is not normalized,
539 | `zExp' must be 0; in that case, the result returned is a subnormal number,
540 | and it must not require rounding.  In the usual case that `zSig' is
541 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
542 | The handling of underflow and overflow follows the IEC/IEEE Standard for
543 | Binary Floating-Point Arithmetic.
544 *----------------------------------------------------------------------------*/
545 
546 static float64 roundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig STATUS_PARAM)
547 {
548     int8 roundingMode;
549     flag roundNearestEven;
550     int_fast16_t roundIncrement, roundBits;
551     flag isTiny;
552 
553     roundingMode = STATUS(float_rounding_mode);
554     roundNearestEven = ( roundingMode == float_round_nearest_even );
555     switch (roundingMode) {
556     case float_round_nearest_even:
557     case float_round_ties_away:
558         roundIncrement = 0x200;
559         break;
560     case float_round_to_zero:
561         roundIncrement = 0;
562         break;
563     case float_round_up:
564         roundIncrement = zSign ? 0 : 0x3ff;
565         break;
566     case float_round_down:
567         roundIncrement = zSign ? 0x3ff : 0;
568         break;
569     default:
570         abort();
571     }
572     roundBits = zSig & 0x3FF;
573     if ( 0x7FD <= (uint16_t) zExp ) {
574         if (    ( 0x7FD < zExp )
575              || (    ( zExp == 0x7FD )
576                   && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
577            ) {
578             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
579             return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
580         }
581         if ( zExp < 0 ) {
582             if (STATUS(flush_to_zero)) {
583                 float_raise(float_flag_output_denormal STATUS_VAR);
584                 return packFloat64(zSign, 0, 0);
585             }
586             isTiny =
587                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
588                 || ( zExp < -1 )
589                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
590             shift64RightJamming( zSig, - zExp, &zSig );
591             zExp = 0;
592             roundBits = zSig & 0x3FF;
593             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
594         }
595     }
596     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
597     zSig = ( zSig + roundIncrement )>>10;
598     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
599     if ( zSig == 0 ) zExp = 0;
600     return packFloat64( zSign, zExp, zSig );
601 
602 }
603 
604 /*----------------------------------------------------------------------------
605 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
606 | and significand `zSig', and returns the proper double-precision floating-
607 | point value corresponding to the abstract input.  This routine is just like
608 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
609 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
610 | floating-point exponent.
611 *----------------------------------------------------------------------------*/
612 
613 static float64
614  normalizeRoundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig STATUS_PARAM)
615 {
616     int8 shiftCount;
617 
618     shiftCount = countLeadingZeros64( zSig ) - 1;
619     return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
620 
621 }
622 
623 /*----------------------------------------------------------------------------
624 | Returns the fraction bits of the extended double-precision floating-point
625 | value `a'.
626 *----------------------------------------------------------------------------*/
627 
628 INLINE uint64_t extractFloatx80Frac( floatx80 a )
629 {
630 
631     return a.low;
632 
633 }
634 
635 /*----------------------------------------------------------------------------
636 | Returns the exponent bits of the extended double-precision floating-point
637 | value `a'.
638 *----------------------------------------------------------------------------*/
639 
640 INLINE int32 extractFloatx80Exp( floatx80 a )
641 {
642 
643     return a.high & 0x7FFF;
644 
645 }
646 
647 /*----------------------------------------------------------------------------
648 | Returns the sign bit of the extended double-precision floating-point value
649 | `a'.
650 *----------------------------------------------------------------------------*/
651 
652 INLINE flag extractFloatx80Sign( floatx80 a )
653 {
654 
655     return a.high>>15;
656 
657 }
658 
659 /*----------------------------------------------------------------------------
660 | Normalizes the subnormal extended double-precision floating-point value
661 | represented by the denormalized significand `aSig'.  The normalized exponent
662 | and significand are stored at the locations pointed to by `zExpPtr' and
663 | `zSigPtr', respectively.
664 *----------------------------------------------------------------------------*/
665 
666 static void
667  normalizeFloatx80Subnormal( uint64_t aSig, int32 *zExpPtr, uint64_t *zSigPtr )
668 {
669     int8 shiftCount;
670 
671     shiftCount = countLeadingZeros64( aSig );
672     *zSigPtr = aSig<<shiftCount;
673     *zExpPtr = 1 - shiftCount;
674 
675 }
676 
677 /*----------------------------------------------------------------------------
678 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
679 | extended double-precision floating-point value, returning the result.
680 *----------------------------------------------------------------------------*/
681 
682 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, uint64_t zSig )
683 {
684     floatx80 z;
685 
686     z.low = zSig;
687     z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
688     return z;
689 
690 }
691 
692 /*----------------------------------------------------------------------------
693 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
694 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
695 | and returns the proper extended double-precision floating-point value
696 | corresponding to the abstract input.  Ordinarily, the abstract value is
697 | rounded and packed into the extended double-precision format, with the
698 | inexact exception raised if the abstract input cannot be represented
699 | exactly.  However, if the abstract value is too large, the overflow and
700 | inexact exceptions are raised and an infinity or maximal finite value is
701 | returned.  If the abstract value is too small, the input value is rounded to
702 | a subnormal number, and the underflow and inexact exceptions are raised if
703 | the abstract input cannot be represented exactly as a subnormal extended
704 | double-precision floating-point number.
705 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
706 | number of bits as single or double precision, respectively.  Otherwise, the
707 | result is rounded to the full precision of the extended double-precision
708 | format.
709 |     The input significand must be normalized or smaller.  If the input
710 | significand is not normalized, `zExp' must be 0; in that case, the result
711 | returned is a subnormal number, and it must not require rounding.  The
712 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
713 | Floating-Point Arithmetic.
714 *----------------------------------------------------------------------------*/
715 
static floatx80
 roundAndPackFloatx80(
     int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
 STATUS_PARAM)
{
    int8 roundingMode;
    flag roundNearestEven, increment, isTiny;
    int64 roundIncrement, roundMask, roundBits;

    roundingMode = STATUS(float_rounding_mode);
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* Any precision other than 64 or 32 is handled as full 80-bit precision. */
    if ( roundingPrecision == 80 ) goto precision80;
    if ( roundingPrecision == 64 ) {
        /* Low 11 bits of zSig0 are rounded away; bit 10 is the half-way
         * increment. */
        roundIncrement = LIT64( 0x0000000000000400 );
        roundMask = LIT64( 0x00000000000007FF );
    }
    else if ( roundingPrecision == 32 ) {
        /* Low 40 bits of zSig0 are rounded away; bit 39 is the half-way
         * increment. */
        roundIncrement = LIT64( 0x0000008000000000 );
        roundMask = LIT64( 0x000000FFFFFFFFFF );
    }
    else {
        goto precision80;
    }
    /* Reduced precision: zSig1 only matters as a sticky bit, so fold any
     * nonzero zSig1 into the least significant bit of zSig0. */
    zSig0 |= ( zSig1 != 0 );
    /* The nearest modes keep the half-way increment chosen above; the
     * directed modes replace it by 0 (truncate) or by the whole mask (round
     * away from zero whenever any discarded bit is set). */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        break;
    case float_round_to_zero:
        roundIncrement = 0;
        break;
    case float_round_up:
        roundIncrement = zSign ? 0 : roundMask;
        break;
    case float_round_down:
        roundIncrement = zSign ? roundMask : 0;
        break;
    default:
        abort();
    }
    roundBits = zSig0 & roundMask;
    /* The unsigned cast makes this test true when zExp <= 0 or
     * zExp >= 0x7FFE, i.e. for the possible underflow and overflow cases. */
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        /* Overflow: exponent too large, or it is the largest finite exponent
         * and adding the increment wraps zSig0. */
        if (    ( 0x7FFE < zExp )
             || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
           ) {
            goto overflow;
        }
        if ( zExp <= 0 ) {
            /* With flush-to-zero enabled, return a signed zero and raise the
             * output-denormal flag instead of producing a subnormal. */
            if (STATUS(flush_to_zero)) {
                float_raise(float_flag_output_denormal STATUS_VAR);
                return packFloatx80(zSign, 0, 0);
            }
            /* Tiny if tininess is detected before rounding, if zExp is
             * negative, or if adding the increment does not wrap zSig0. */
            isTiny =
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
                || ( zExp < 0 )
                || ( zSig0 <= zSig0 + roundIncrement );
            /* Denormalize (shifted-out bits are jammed into the LSB) and
             * recompute the discarded bits for the shifted significand. */
            shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
            zExp = 0;
            roundBits = zSig0 & roundMask;
            if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
            if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
            zSig0 += roundIncrement;
            /* If bit 63 is set after rounding, the exponent field becomes 1. */
            if ( (int64_t) zSig0 < 0 ) zExp = 1;
            /* On an exact tie in nearest-even mode, widen the mask by one bit
             * so the lowest kept bit is cleared as well. */
            roundIncrement = roundMask + 1;
            if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
                roundMask |= roundIncrement;
            }
            zSig0 &= ~ roundMask;
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    /* Ordinary reduced-precision case. */
    if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
    zSig0 += roundIncrement;
    /* The addition wrapped around: bump the exponent and set the significand
     * to just its top bit. */
    if ( zSig0 < roundIncrement ) {
        ++zExp;
        zSig0 = LIT64( 0x8000000000000000 );
    }
    /* Same tie-to-even adjustment as in the subnormal path above. */
    roundIncrement = roundMask + 1;
    if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
        roundMask |= roundIncrement;
    }
    zSig0 &= ~ roundMask;
    /* A zero significand is packed with a zero exponent. */
    if ( zSig0 == 0 ) zExp = 0;
    return packFloatx80( zSign, zExp, zSig0 );
 precision80:
    /* Full precision: zSig1 holds the bits below the 64-bit significand.
     * Decide whether zSig0 has to be incremented: the nearest modes look at
     * the top bit of zSig1, the directed modes at whether zSig1 is nonzero
     * and at the sign. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t)zSig1 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && zSig1;
        break;
    case float_round_down:
        increment = zSign && zSig1;
        break;
    default:
        abort();
    }
    /* Same range test as above: true when zExp <= 0 or zExp >= 0x7FFE. */
    if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
        if (    ( 0x7FFE < zExp )
             || (    ( zExp == 0x7FFE )
                  && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
                  && increment
                )
           ) {
            /* With roundMask cleared, ~roundMask below is an all-ones
             * significand.  The reduced-precision path jumps to the label
             * with its own mask still in place. */
            roundMask = 0;
 overflow:
            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
            /* Modes that round toward zero for this sign return the value
             * with exponent 0x7FFE and significand ~roundMask; all other
             * cases return exponent 0x7FFF with only the top significand bit
             * set. */
            if (    ( roundingMode == float_round_to_zero )
                 || ( zSign && ( roundingMode == float_round_up ) )
                 || ( ! zSign && ( roundingMode == float_round_down ) )
               ) {
                return packFloatx80( zSign, 0x7FFE, ~ roundMask );
            }
            return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
        }
        if ( zExp <= 0 ) {
            /* Tiny if tininess is detected before rounding, if zExp is
             * negative, if no increment is pending, or if zSig0 is not all
             * ones. */
            isTiny =
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
                || ( zExp < 0 )
                || ! increment
                || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
            /* Denormalize the 128-bit significand by 1 - zExp bits. */
            shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
            zExp = 0;
            if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
            if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
            /* Re-evaluate the rounding decision on the shifted value. */
            switch (roundingMode) {
            case float_round_nearest_even:
            case float_round_ties_away:
                increment = ((int64_t)zSig1 < 0);
                break;
            case float_round_to_zero:
                increment = 0;
                break;
            case float_round_up:
                increment = !zSign && zSig1;
                break;
            case float_round_down:
                increment = zSign && zSig1;
                break;
            default:
                abort();
            }
            if ( increment ) {
                ++zSig0;
                /* In nearest-even mode, when all bits of zSig1 below its top
                 * bit are zero (an exact tie), clear the LSB of zSig0. */
                zSig0 &=
                    ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
                /* If bit 63 is set after rounding, the exponent field
                 * becomes 1. */
                if ( (int64_t) zSig0 < 0 ) zExp = 1;
            }
            return packFloatx80( zSign, zExp, zSig0 );
        }
    }
    /* Ordinary full-precision case. */
    if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
    if ( increment ) {
        ++zSig0;
        if ( zSig0 == 0 ) {
            /* The increment wrapped zSig0 to zero: bump the exponent and
             * set the significand to just its top bit. */
            ++zExp;
            zSig0 = LIT64( 0x8000000000000000 );
        }
        else {
            /* Exact tie in nearest-even mode: clear the LSB. */
            zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
        }
    }
    else {
        /* A zero significand is packed with a zero exponent. */
        if ( zSig0 == 0 ) zExp = 0;
    }
    return packFloatx80( zSign, zExp, zSig0 );

}
889 
890 /*----------------------------------------------------------------------------
891 | Takes an abstract floating-point value having sign `zSign', exponent
892 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
893 | and returns the proper extended double-precision floating-point value
894 | corresponding to the abstract input.  This routine is just like
895 | `roundAndPackFloatx80' except that the input significand does not have to be
896 | normalized.
897 *----------------------------------------------------------------------------*/
898 
899 static floatx80
900  normalizeRoundAndPackFloatx80(
901      int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
902  STATUS_PARAM)
903 {
904     int8 shiftCount;
905 
906     if ( zSig0 == 0 ) {
907         zSig0 = zSig1;
908         zSig1 = 0;
909         zExp -= 64;
910     }
911     shiftCount = countLeadingZeros64( zSig0 );
912     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
913     zExp -= shiftCount;
914     return
915         roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
916 
917 }
918 
919 /*----------------------------------------------------------------------------
920 | Returns the least-significant 64 fraction bits of the quadruple-precision
921 | floating-point value `a'.
922 *----------------------------------------------------------------------------*/
923 
924 INLINE uint64_t extractFloat128Frac1( float128 a )
925 {
926 
927     return a.low;
928 
929 }
930 
931 /*----------------------------------------------------------------------------
932 | Returns the most-significant 48 fraction bits of the quadruple-precision
933 | floating-point value `a'.
934 *----------------------------------------------------------------------------*/
935 
936 INLINE uint64_t extractFloat128Frac0( float128 a )
937 {
938 
939     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
940 
941 }
942 
943 /*----------------------------------------------------------------------------
944 | Returns the exponent bits of the quadruple-precision floating-point value
945 | `a'.
946 *----------------------------------------------------------------------------*/
947 
948 INLINE int32 extractFloat128Exp( float128 a )
949 {
950 
951     return ( a.high>>48 ) & 0x7FFF;
952 
953 }
954 
955 /*----------------------------------------------------------------------------
956 | Returns the sign bit of the quadruple-precision floating-point value `a'.
957 *----------------------------------------------------------------------------*/
958 
959 INLINE flag extractFloat128Sign( float128 a )
960 {
961 
962     return a.high>>63;
963 
964 }
965 
966 /*----------------------------------------------------------------------------
967 | Normalizes the subnormal quadruple-precision floating-point value
968 | represented by the denormalized significand formed by the concatenation of
969 | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
970 | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
971 | significand are stored at the location pointed to by `zSig0Ptr', and the
972 | least significant 64 bits of the normalized significand are stored at the
973 | location pointed to by `zSig1Ptr'.
974 *----------------------------------------------------------------------------*/
975 
976 static void
977  normalizeFloat128Subnormal(
978      uint64_t aSig0,
979      uint64_t aSig1,
980      int32 *zExpPtr,
981      uint64_t *zSig0Ptr,
982      uint64_t *zSig1Ptr
983  )
984 {
985     int8 shiftCount;
986 
987     if ( aSig0 == 0 ) {
988         shiftCount = countLeadingZeros64( aSig1 ) - 15;
989         if ( shiftCount < 0 ) {
990             *zSig0Ptr = aSig1>>( - shiftCount );
991             *zSig1Ptr = aSig1<<( shiftCount & 63 );
992         }
993         else {
994             *zSig0Ptr = aSig1<<shiftCount;
995             *zSig1Ptr = 0;
996         }
997         *zExpPtr = - shiftCount - 63;
998     }
999     else {
1000         shiftCount = countLeadingZeros64( aSig0 ) - 15;
1001         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
1002         *zExpPtr = 1 - shiftCount;
1003     }
1004 
1005 }
1006 
1007 /*----------------------------------------------------------------------------
1008 | Packs the sign `zSign', the exponent `zExp', and the significand formed
1009 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
1010 | floating-point value, returning the result.  After being shifted into the
1011 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
1012 | added together to form the most significant 32 bits of the result.  This
1013 | means that any integer portion of `zSig0' will be added into the exponent.
1014 | Since a properly normalized significand will have an integer portion equal
1015 | to 1, the `zExp' input should be 1 less than the desired result exponent
1016 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
1017 | significand.
1018 *----------------------------------------------------------------------------*/
1019 
1020 INLINE float128
1021  packFloat128( flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 )
1022 {
1023     float128 z;
1024 
1025     z.low = zSig1;
1026     z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
1027     return z;
1028 
1029 }
1030 
1031 /*----------------------------------------------------------------------------
1032 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1033 | and extended significand formed by the concatenation of `zSig0', `zSig1',
1034 | and `zSig2', and returns the proper quadruple-precision floating-point value
1035 | corresponding to the abstract input.  Ordinarily, the abstract value is
1036 | simply rounded and packed into the quadruple-precision format, with the
1037 | inexact exception raised if the abstract input cannot be represented
1038 | exactly.  However, if the abstract value is too large, the overflow and
1039 | inexact exceptions are raised and an infinity or maximal finite value is
1040 | returned.  If the abstract value is too small, the input value is rounded to
1041 | a subnormal number, and the underflow and inexact exceptions are raised if
1042 | the abstract input cannot be represented exactly as a subnormal quadruple-
1043 | precision floating-point number.
1044 |     The input significand must be normalized or smaller.  If the input
1045 | significand is not normalized, `zExp' must be 0; in that case, the result
1046 | returned is a subnormal number, and it must not require rounding.  In the
1047 | usual case that the input significand is normalized, `zExp' must be 1 less
1048 | than the ``true'' floating-point exponent.  The handling of underflow and
1049 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1050 *----------------------------------------------------------------------------*/
1051 
static float128
 roundAndPackFloat128(
     flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1, uint64_t zSig2 STATUS_PARAM)
{
    int8 roundingMode;
    flag roundNearestEven, increment, isTiny;

    roundingMode = STATUS(float_rounding_mode);
    roundNearestEven = ( roundingMode == float_round_nearest_even );
    /* zSig2 holds the bits below the 128-bit significand zSig0:zSig1.
     * Decide whether the significand has to be incremented: the nearest
     * modes look at the top bit of zSig2, the directed modes at whether
     * zSig2 is nonzero and at the sign. */
    switch (roundingMode) {
    case float_round_nearest_even:
    case float_round_ties_away:
        increment = ((int64_t)zSig2 < 0);
        break;
    case float_round_to_zero:
        increment = 0;
        break;
    case float_round_up:
        increment = !zSign && zSig2;
        break;
    case float_round_down:
        increment = zSign && zSig2;
        break;
    default:
        abort();
    }
    /* The unsigned cast makes this test true when zExp < 0 or
     * zExp >= 0x7FFD, i.e. for the possible underflow and overflow cases. */
    if ( 0x7FFD <= (uint32_t) zExp ) {
        /* Overflow: exponent too large, or it equals 0x7FFD with an all-ones
         * 113-bit significand that is about to be incremented. */
        if (    ( 0x7FFD < zExp )
             || (    ( zExp == 0x7FFD )
                  && eq128(
                         LIT64( 0x0001FFFFFFFFFFFF ),
                         LIT64( 0xFFFFFFFFFFFFFFFF ),
                         zSig0,
                         zSig1
                     )
                  && increment
                )
           ) {
            float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
            /* Modes that round toward zero for this sign return exponent
             * 0x7FFE with an all-ones fraction; all other cases return
             * exponent 0x7FFF with a zero fraction. */
            if (    ( roundingMode == float_round_to_zero )
                 || ( zSign && ( roundingMode == float_round_up ) )
                 || ( ! zSign && ( roundingMode == float_round_down ) )
               ) {
                return
                    packFloat128(
                        zSign,
                        0x7FFE,
                        LIT64( 0x0000FFFFFFFFFFFF ),
                        LIT64( 0xFFFFFFFFFFFFFFFF )
                    );
            }
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        if ( zExp < 0 ) {
            /* With flush-to-zero enabled, return a signed zero and raise the
             * output-denormal flag instead of producing a subnormal. */
            if (STATUS(flush_to_zero)) {
                float_raise(float_flag_output_denormal STATUS_VAR);
                return packFloat128(zSign, 0, 0, 0);
            }
            /* Tiny if tininess is detected before rounding, if zExp is below
             * -1, if no increment is pending, or if the significand is less
             * than the all-ones 113-bit value. */
            isTiny =
                   ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
                || ( zExp < -1 )
                || ! increment
                || lt128(
                       zSig0,
                       zSig1,
                       LIT64( 0x0001FFFFFFFFFFFF ),
                       LIT64( 0xFFFFFFFFFFFFFFFF )
                   );
            /* Denormalize the 192-bit significand by -zExp bits. */
            shift128ExtraRightJamming(
                zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
            zExp = 0;
            if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
            /* Re-evaluate the rounding decision on the shifted value. */
            switch (roundingMode) {
            case float_round_nearest_even:
            case float_round_ties_away:
                increment = ((int64_t)zSig2 < 0);
                break;
            case float_round_to_zero:
                increment = 0;
                break;
            case float_round_up:
                increment = !zSign && zSig2;
                break;
            case float_round_down:
                increment = zSign && zSig2;
                break;
            default:
                abort();
            }
        }
    }
    /* Common tail for the normal and subnormal cases. */
    if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
    if ( increment ) {
        add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
        /* In nearest-even mode, when zSig2 + zSig2 wraps to zero (an exact
         * tie, given that its top bit triggered the increment), clear the
         * LSB of zSig1. */
        zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
    }
    else {
        /* A zero significand is packed with a zero exponent. */
        if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
    }
    return packFloat128( zSign, zExp, zSig0, zSig1 );

}
1154 
1155 /*----------------------------------------------------------------------------
1156 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1157 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1158 | returns the proper quadruple-precision floating-point value corresponding
1159 | to the abstract input.  This routine is just like `roundAndPackFloat128'
1160 | except that the input significand has fewer bits and does not have to be
1161 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1162 | point exponent.
1163 *----------------------------------------------------------------------------*/
1164 
1165 static float128
1166  normalizeRoundAndPackFloat128(
1167      flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 STATUS_PARAM)
1168 {
1169     int8 shiftCount;
1170     uint64_t zSig2;
1171 
1172     if ( zSig0 == 0 ) {
1173         zSig0 = zSig1;
1174         zSig1 = 0;
1175         zExp -= 64;
1176     }
1177     shiftCount = countLeadingZeros64( zSig0 ) - 15;
1178     if ( 0 <= shiftCount ) {
1179         zSig2 = 0;
1180         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1181     }
1182     else {
1183         shift128ExtraRightJamming(
1184             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1185     }
1186     zExp -= shiftCount;
1187     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1188 
1189 }
1190 
1191 /*----------------------------------------------------------------------------
1192 | Returns the result of converting the 32-bit two's complement integer `a'
1193 | to the single-precision floating-point format.  The conversion is performed
1194 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1195 *----------------------------------------------------------------------------*/
1196 
1197 float32 int32_to_float32(int32_t a STATUS_PARAM)
1198 {
1199     flag zSign;
1200 
1201     if ( a == 0 ) return float32_zero;
1202     if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1203     zSign = ( a < 0 );
1204     return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1205 
1206 }
1207 
1208 /*----------------------------------------------------------------------------
1209 | Returns the result of converting the 32-bit two's complement integer `a'
1210 | to the double-precision floating-point format.  The conversion is performed
1211 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1212 *----------------------------------------------------------------------------*/
1213 
1214 float64 int32_to_float64(int32_t a STATUS_PARAM)
1215 {
1216     flag zSign;
1217     uint32 absA;
1218     int8 shiftCount;
1219     uint64_t zSig;
1220 
1221     if ( a == 0 ) return float64_zero;
1222     zSign = ( a < 0 );
1223     absA = zSign ? - a : a;
1224     shiftCount = countLeadingZeros32( absA ) + 21;
1225     zSig = absA;
1226     return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1227 
1228 }
1229 
1230 /*----------------------------------------------------------------------------
1231 | Returns the result of converting the 32-bit two's complement integer `a'
1232 | to the extended double-precision floating-point format.  The conversion
1233 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1234 | Arithmetic.
1235 *----------------------------------------------------------------------------*/
1236 
1237 floatx80 int32_to_floatx80(int32_t a STATUS_PARAM)
1238 {
1239     flag zSign;
1240     uint32 absA;
1241     int8 shiftCount;
1242     uint64_t zSig;
1243 
1244     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1245     zSign = ( a < 0 );
1246     absA = zSign ? - a : a;
1247     shiftCount = countLeadingZeros32( absA ) + 32;
1248     zSig = absA;
1249     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1250 
1251 }
1252 
1253 /*----------------------------------------------------------------------------
1254 | Returns the result of converting the 32-bit two's complement integer `a' to
1255 | the quadruple-precision floating-point format.  The conversion is performed
1256 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1257 *----------------------------------------------------------------------------*/
1258 
1259 float128 int32_to_float128(int32_t a STATUS_PARAM)
1260 {
1261     flag zSign;
1262     uint32 absA;
1263     int8 shiftCount;
1264     uint64_t zSig0;
1265 
1266     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1267     zSign = ( a < 0 );
1268     absA = zSign ? - a : a;
1269     shiftCount = countLeadingZeros32( absA ) + 17;
1270     zSig0 = absA;
1271     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1272 
1273 }
1274 
1275 /*----------------------------------------------------------------------------
1276 | Returns the result of converting the 64-bit two's complement integer `a'
1277 | to the single-precision floating-point format.  The conversion is performed
1278 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1279 *----------------------------------------------------------------------------*/
1280 
1281 float32 int64_to_float32(int64_t a STATUS_PARAM)
1282 {
1283     flag zSign;
1284     uint64 absA;
1285     int8 shiftCount;
1286 
1287     if ( a == 0 ) return float32_zero;
1288     zSign = ( a < 0 );
1289     absA = zSign ? - a : a;
1290     shiftCount = countLeadingZeros64( absA ) - 40;
1291     if ( 0 <= shiftCount ) {
1292         return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1293     }
1294     else {
1295         shiftCount += 7;
1296         if ( shiftCount < 0 ) {
1297             shift64RightJamming( absA, - shiftCount, &absA );
1298         }
1299         else {
1300             absA <<= shiftCount;
1301         }
1302         return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1303     }
1304 
1305 }
1306 
1307 float32 uint64_to_float32(uint64_t a STATUS_PARAM)
1308 {
1309     int8 shiftCount;
1310 
1311     if ( a == 0 ) return float32_zero;
1312     shiftCount = countLeadingZeros64( a ) - 40;
1313     if ( 0 <= shiftCount ) {
1314         return packFloat32(0, 0x95 - shiftCount, a<<shiftCount);
1315     }
1316     else {
1317         shiftCount += 7;
1318         if ( shiftCount < 0 ) {
1319             shift64RightJamming( a, - shiftCount, &a );
1320         }
1321         else {
1322             a <<= shiftCount;
1323         }
1324         return roundAndPackFloat32(0, 0x9C - shiftCount, a STATUS_VAR);
1325     }
1326 }
1327 
1328 /*----------------------------------------------------------------------------
1329 | Returns the result of converting the 64-bit two's complement integer `a'
1330 | to the double-precision floating-point format.  The conversion is performed
1331 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1332 *----------------------------------------------------------------------------*/
1333 
1334 float64 int64_to_float64(int64_t a STATUS_PARAM)
1335 {
1336     flag zSign;
1337 
1338     if ( a == 0 ) return float64_zero;
1339     if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
1340         return packFloat64( 1, 0x43E, 0 );
1341     }
1342     zSign = ( a < 0 );
1343     return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1344 
1345 }
1346 
1347 float64 uint64_to_float64(uint64_t a STATUS_PARAM)
1348 {
1349     int exp =  0x43C;
1350 
1351     if (a == 0) {
1352         return float64_zero;
1353     }
1354     if ((int64_t)a < 0) {
1355         shift64RightJamming(a, 1, &a);
1356         exp += 1;
1357     }
1358     return normalizeRoundAndPackFloat64(0, exp, a STATUS_VAR);
1359 }
1360 
1361 /*----------------------------------------------------------------------------
1362 | Returns the result of converting the 64-bit two's complement integer `a'
1363 | to the extended double-precision floating-point format.  The conversion
1364 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1365 | Arithmetic.
1366 *----------------------------------------------------------------------------*/
1367 
1368 floatx80 int64_to_floatx80(int64_t a STATUS_PARAM)
1369 {
1370     flag zSign;
1371     uint64 absA;
1372     int8 shiftCount;
1373 
1374     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1375     zSign = ( a < 0 );
1376     absA = zSign ? - a : a;
1377     shiftCount = countLeadingZeros64( absA );
1378     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1379 
1380 }
1381 
1382 /*----------------------------------------------------------------------------
1383 | Returns the result of converting the 64-bit two's complement integer `a' to
1384 | the quadruple-precision floating-point format.  The conversion is performed
1385 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1386 *----------------------------------------------------------------------------*/
1387 
1388 float128 int64_to_float128(int64_t a STATUS_PARAM)
1389 {
1390     flag zSign;
1391     uint64 absA;
1392     int8 shiftCount;
1393     int32 zExp;
1394     uint64_t zSig0, zSig1;
1395 
1396     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1397     zSign = ( a < 0 );
1398     absA = zSign ? - a : a;
1399     shiftCount = countLeadingZeros64( absA ) + 49;
1400     zExp = 0x406E - shiftCount;
1401     if ( 64 <= shiftCount ) {
1402         zSig1 = 0;
1403         zSig0 = absA;
1404         shiftCount -= 64;
1405     }
1406     else {
1407         zSig1 = absA;
1408         zSig0 = 0;
1409     }
1410     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1411     return packFloat128( zSign, zExp, zSig0, zSig1 );
1412 
1413 }
1414 
1415 float128 uint64_to_float128(uint64_t a STATUS_PARAM)
1416 {
1417     if (a == 0) {
1418         return float128_zero;
1419     }
1420     return normalizeRoundAndPackFloat128(0, 0x406E, a, 0 STATUS_VAR);
1421 }
1422 
1423 /*----------------------------------------------------------------------------
1424 | Returns the result of converting the single-precision floating-point value
1425 | `a' to the 32-bit two's complement integer format.  The conversion is
1426 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1427 | Arithmetic---which means in particular that the conversion is rounded
1428 | according to the current rounding mode.  If `a' is a NaN, the largest
1429 | positive integer is returned.  Otherwise, if the conversion overflows, the
1430 | largest integer with the same sign as `a' is returned.
1431 *----------------------------------------------------------------------------*/
1432 
1433 int32 float32_to_int32( float32 a STATUS_PARAM )
1434 {
1435     flag aSign;
1436     int_fast16_t aExp, shiftCount;
1437     uint32_t aSig;
1438     uint64_t aSig64;
1439 
1440     a = float32_squash_input_denormal(a STATUS_VAR);
1441     aSig = extractFloat32Frac( a );
1442     aExp = extractFloat32Exp( a );
1443     aSign = extractFloat32Sign( a );
1444     if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1445     if ( aExp ) aSig |= 0x00800000;
1446     shiftCount = 0xAF - aExp;
1447     aSig64 = aSig;
1448     aSig64 <<= 32;
1449     if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1450     return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1451 
1452 }
1453 
1454 /*----------------------------------------------------------------------------
1455 | Returns the result of converting the single-precision floating-point value
1456 | `a' to the 32-bit two's complement integer format.  The conversion is
1457 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1458 | Arithmetic, except that the conversion is always rounded toward zero.
1459 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1460 | the conversion overflows, the largest integer with the same sign as `a' is
1461 | returned.
1462 *----------------------------------------------------------------------------*/
1463 
1464 int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1465 {
1466     flag aSign;
1467     int_fast16_t aExp, shiftCount;
1468     uint32_t aSig;
1469     int32_t z;
1470     a = float32_squash_input_denormal(a STATUS_VAR);
1471 
1472     aSig = extractFloat32Frac( a );
1473     aExp = extractFloat32Exp( a );
1474     aSign = extractFloat32Sign( a );
1475     shiftCount = aExp - 0x9E;
1476     if ( 0 <= shiftCount ) {
1477         if ( float32_val(a) != 0xCF000000 ) {
1478             float_raise( float_flag_invalid STATUS_VAR);
1479             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1480         }
1481         return (int32_t) 0x80000000;
1482     }
1483     else if ( aExp <= 0x7E ) {
1484         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1485         return 0;
1486     }
1487     aSig = ( aSig | 0x00800000 )<<8;
1488     z = aSig>>( - shiftCount );
1489     if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1490         STATUS(float_exception_flags) |= float_flag_inexact;
1491     }
1492     if ( aSign ) z = - z;
1493     return z;
1494 
1495 }
1496 
1497 /*----------------------------------------------------------------------------
1498 | Returns the result of converting the single-precision floating-point value
1499 | `a' to the 16-bit two's complement integer format.  The conversion is
1500 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1501 | Arithmetic, except that the conversion is always rounded toward zero.
1502 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1503 | the conversion overflows, the largest integer with the same sign as `a' is
1504 | returned.
1505 *----------------------------------------------------------------------------*/
1506 
1507 int_fast16_t float32_to_int16_round_to_zero(float32 a STATUS_PARAM)
1508 {
1509     flag aSign;
1510     int_fast16_t aExp, shiftCount;
1511     uint32_t aSig;
1512     int32 z;
1513 
1514     aSig = extractFloat32Frac( a );
1515     aExp = extractFloat32Exp( a );
1516     aSign = extractFloat32Sign( a );
1517     shiftCount = aExp - 0x8E;
1518     if ( 0 <= shiftCount ) {
1519         if ( float32_val(a) != 0xC7000000 ) {
1520             float_raise( float_flag_invalid STATUS_VAR);
1521             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1522                 return 0x7FFF;
1523             }
1524         }
1525         return (int32_t) 0xffff8000;
1526     }
1527     else if ( aExp <= 0x7E ) {
1528         if ( aExp | aSig ) {
1529             STATUS(float_exception_flags) |= float_flag_inexact;
1530         }
1531         return 0;
1532     }
1533     shiftCount -= 0x10;
1534     aSig = ( aSig | 0x00800000 )<<8;
1535     z = aSig>>( - shiftCount );
1536     if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1537         STATUS(float_exception_flags) |= float_flag_inexact;
1538     }
1539     if ( aSign ) {
1540         z = - z;
1541     }
1542     return z;
1543 
1544 }
1545 
1546 /*----------------------------------------------------------------------------
1547 | Returns the result of converting the single-precision floating-point value
1548 | `a' to the 64-bit two's complement integer format.  The conversion is
1549 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1550 | Arithmetic---which means in particular that the conversion is rounded
1551 | according to the current rounding mode.  If `a' is a NaN, the largest
1552 | positive integer is returned.  Otherwise, if the conversion overflows, the
1553 | largest integer with the same sign as `a' is returned.
1554 *----------------------------------------------------------------------------*/
1555 
1556 int64 float32_to_int64( float32 a STATUS_PARAM )
1557 {
1558     flag aSign;
1559     int_fast16_t aExp, shiftCount;
1560     uint32_t aSig;
1561     uint64_t aSig64, aSigExtra;
1562     a = float32_squash_input_denormal(a STATUS_VAR);
1563 
1564     aSig = extractFloat32Frac( a );
1565     aExp = extractFloat32Exp( a );
1566     aSign = extractFloat32Sign( a );
1567     shiftCount = 0xBE - aExp;
1568     if ( shiftCount < 0 ) {
1569         float_raise( float_flag_invalid STATUS_VAR);
1570         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1571             return LIT64( 0x7FFFFFFFFFFFFFFF );
1572         }
1573         return (int64_t) LIT64( 0x8000000000000000 );
1574     }
1575     if ( aExp ) aSig |= 0x00800000;
1576     aSig64 = aSig;
1577     aSig64 <<= 40;
1578     shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1579     return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1580 
1581 }
1582 
1583 /*----------------------------------------------------------------------------
1584 | Returns the result of converting the single-precision floating-point value
1585 | `a' to the 64-bit unsigned integer format.  The conversion is
1586 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1587 | Arithmetic---which means in particular that the conversion is rounded
1588 | according to the current rounding mode.  If `a' is a NaN, the largest
1589 | unsigned integer is returned.  Otherwise, if the conversion overflows, the
1590 | largest unsigned integer is returned.  If the 'a' is negative, the result
1591 | is rounded and zero is returned; values that do not round to zero will
1592 | raise the inexact exception flag.
1593 *----------------------------------------------------------------------------*/
1594 
1595 uint64 float32_to_uint64(float32 a STATUS_PARAM)
1596 {
1597     flag aSign;
1598     int_fast16_t aExp, shiftCount;
1599     uint32_t aSig;
1600     uint64_t aSig64, aSigExtra;
1601     a = float32_squash_input_denormal(a STATUS_VAR);
1602 
1603     aSig = extractFloat32Frac(a);
1604     aExp = extractFloat32Exp(a);
1605     aSign = extractFloat32Sign(a);
1606     if ((aSign) && (aExp > 126)) {
1607         float_raise(float_flag_invalid STATUS_VAR);
1608         if (float32_is_any_nan(a)) {
1609             return LIT64(0xFFFFFFFFFFFFFFFF);
1610         } else {
1611             return 0;
1612         }
1613     }
1614     shiftCount = 0xBE - aExp;
1615     if (aExp) {
1616         aSig |= 0x00800000;
1617     }
1618     if (shiftCount < 0) {
1619         float_raise(float_flag_invalid STATUS_VAR);
1620         return LIT64(0xFFFFFFFFFFFFFFFF);
1621     }
1622 
1623     aSig64 = aSig;
1624     aSig64 <<= 40;
1625     shift64ExtraRightJamming(aSig64, 0, shiftCount, &aSig64, &aSigExtra);
1626     return roundAndPackUint64(aSign, aSig64, aSigExtra STATUS_VAR);
1627 }
1628 
1629 /*----------------------------------------------------------------------------
1630 | Returns the result of converting the single-precision floating-point value
1631 | `a' to the 64-bit unsigned integer format.  The conversion is
1632 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1633 | Arithmetic, except that the conversion is always rounded toward zero.  If
1634 | `a' is a NaN, the largest unsigned integer is returned.  Otherwise, if the
1635 | conversion overflows, the largest unsigned integer is returned.  If the
1636 | 'a' is negative, the result is rounded and zero is returned; values that do
1637 | not round to zero will raise the inexact flag.
1638 *----------------------------------------------------------------------------*/
1639 
1640 uint64 float32_to_uint64_round_to_zero(float32 a STATUS_PARAM)
1641 {
1642     signed char current_rounding_mode = STATUS(float_rounding_mode);
1643     set_float_rounding_mode(float_round_to_zero STATUS_VAR);
1644     int64_t v = float32_to_uint64(a STATUS_VAR);
1645     set_float_rounding_mode(current_rounding_mode STATUS_VAR);
1646     return v;
1647 }
1648 
1649 /*----------------------------------------------------------------------------
1650 | Returns the result of converting the single-precision floating-point value
1651 | `a' to the 64-bit two's complement integer format.  The conversion is
1652 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1653 | Arithmetic, except that the conversion is always rounded toward zero.  If
1654 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1655 | conversion overflows, the largest integer with the same sign as `a' is
1656 | returned.
1657 *----------------------------------------------------------------------------*/
1658 
1659 int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1660 {
1661     flag aSign;
1662     int_fast16_t aExp, shiftCount;
1663     uint32_t aSig;
1664     uint64_t aSig64;
1665     int64 z;
1666     a = float32_squash_input_denormal(a STATUS_VAR);
1667 
1668     aSig = extractFloat32Frac( a );
1669     aExp = extractFloat32Exp( a );
1670     aSign = extractFloat32Sign( a );
1671     shiftCount = aExp - 0xBE;
1672     if ( 0 <= shiftCount ) {
1673         if ( float32_val(a) != 0xDF000000 ) {
1674             float_raise( float_flag_invalid STATUS_VAR);
1675             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1676                 return LIT64( 0x7FFFFFFFFFFFFFFF );
1677             }
1678         }
1679         return (int64_t) LIT64( 0x8000000000000000 );
1680     }
1681     else if ( aExp <= 0x7E ) {
1682         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1683         return 0;
1684     }
1685     aSig64 = aSig | 0x00800000;
1686     aSig64 <<= 40;
1687     z = aSig64>>( - shiftCount );
1688     if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
1689         STATUS(float_exception_flags) |= float_flag_inexact;
1690     }
1691     if ( aSign ) z = - z;
1692     return z;
1693 
1694 }
1695 
1696 /*----------------------------------------------------------------------------
1697 | Returns the result of converting the single-precision floating-point value
1698 | `a' to the double-precision floating-point format.  The conversion is
1699 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1700 | Arithmetic.
1701 *----------------------------------------------------------------------------*/
1702 
1703 float64 float32_to_float64( float32 a STATUS_PARAM )
1704 {
1705     flag aSign;
1706     int_fast16_t aExp;
1707     uint32_t aSig;
1708     a = float32_squash_input_denormal(a STATUS_VAR);
1709 
1710     aSig = extractFloat32Frac( a );
1711     aExp = extractFloat32Exp( a );
1712     aSign = extractFloat32Sign( a );
1713     if ( aExp == 0xFF ) {
1714         if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1715         return packFloat64( aSign, 0x7FF, 0 );
1716     }
1717     if ( aExp == 0 ) {
1718         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1719         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1720         --aExp;
1721     }
1722     return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
1723 
1724 }
1725 
1726 /*----------------------------------------------------------------------------
1727 | Returns the result of converting the single-precision floating-point value
1728 | `a' to the extended double-precision floating-point format.  The conversion
1729 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1730 | Arithmetic.
1731 *----------------------------------------------------------------------------*/
1732 
1733 floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1734 {
1735     flag aSign;
1736     int_fast16_t aExp;
1737     uint32_t aSig;
1738 
1739     a = float32_squash_input_denormal(a STATUS_VAR);
1740     aSig = extractFloat32Frac( a );
1741     aExp = extractFloat32Exp( a );
1742     aSign = extractFloat32Sign( a );
1743     if ( aExp == 0xFF ) {
1744         if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1745         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1746     }
1747     if ( aExp == 0 ) {
1748         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1749         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1750     }
1751     aSig |= 0x00800000;
1752     return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
1753 
1754 }
1755 
1756 /*----------------------------------------------------------------------------
1757 | Returns the result of converting the single-precision floating-point value
1758 | `a' to the double-precision floating-point format.  The conversion is
1759 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1760 | Arithmetic.
1761 *----------------------------------------------------------------------------*/
1762 
1763 float128 float32_to_float128( float32 a STATUS_PARAM )
1764 {
1765     flag aSign;
1766     int_fast16_t aExp;
1767     uint32_t aSig;
1768 
1769     a = float32_squash_input_denormal(a STATUS_VAR);
1770     aSig = extractFloat32Frac( a );
1771     aExp = extractFloat32Exp( a );
1772     aSign = extractFloat32Sign( a );
1773     if ( aExp == 0xFF ) {
1774         if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1775         return packFloat128( aSign, 0x7FFF, 0, 0 );
1776     }
1777     if ( aExp == 0 ) {
1778         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1779         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1780         --aExp;
1781     }
1782     return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
1783 
1784 }
1785 
1786 /*----------------------------------------------------------------------------
1787 | Rounds the single-precision floating-point value `a' to an integer, and
1788 | returns the result as a single-precision floating-point value.  The
1789 | operation is performed according to the IEC/IEEE Standard for Binary
1790 | Floating-Point Arithmetic.
1791 *----------------------------------------------------------------------------*/
1792 
1793 float32 float32_round_to_int( float32 a STATUS_PARAM)
1794 {
1795     flag aSign;
1796     int_fast16_t aExp;
1797     uint32_t lastBitMask, roundBitsMask;
1798     uint32_t z;
1799     a = float32_squash_input_denormal(a STATUS_VAR);
1800 
1801     aExp = extractFloat32Exp( a );
1802     if ( 0x96 <= aExp ) {
1803         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1804             return propagateFloat32NaN( a, a STATUS_VAR );
1805         }
1806         return a;
1807     }
1808     if ( aExp <= 0x7E ) {
1809         if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
1810         STATUS(float_exception_flags) |= float_flag_inexact;
1811         aSign = extractFloat32Sign( a );
1812         switch ( STATUS(float_rounding_mode) ) {
1813          case float_round_nearest_even:
1814             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1815                 return packFloat32( aSign, 0x7F, 0 );
1816             }
1817             break;
1818         case float_round_ties_away:
1819             if (aExp == 0x7E) {
1820                 return packFloat32(aSign, 0x7F, 0);
1821             }
1822             break;
1823          case float_round_down:
1824             return make_float32(aSign ? 0xBF800000 : 0);
1825          case float_round_up:
1826             return make_float32(aSign ? 0x80000000 : 0x3F800000);
1827         }
1828         return packFloat32( aSign, 0, 0 );
1829     }
1830     lastBitMask = 1;
1831     lastBitMask <<= 0x96 - aExp;
1832     roundBitsMask = lastBitMask - 1;
1833     z = float32_val(a);
1834     switch (STATUS(float_rounding_mode)) {
1835     case float_round_nearest_even:
1836         z += lastBitMask>>1;
1837         if ((z & roundBitsMask) == 0) {
1838             z &= ~lastBitMask;
1839         }
1840         break;
1841     case float_round_ties_away:
1842         z += lastBitMask >> 1;
1843         break;
1844     case float_round_to_zero:
1845         break;
1846     case float_round_up:
1847         if (!extractFloat32Sign(make_float32(z))) {
1848             z += roundBitsMask;
1849         }
1850         break;
1851     case float_round_down:
1852         if (extractFloat32Sign(make_float32(z))) {
1853             z += roundBitsMask;
1854         }
1855         break;
1856     default:
1857         abort();
1858     }
1859     z &= ~ roundBitsMask;
1860     if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
1861     return make_float32(z);
1862 
1863 }
1864 
1865 /*----------------------------------------------------------------------------
1866 | Returns the result of adding the absolute values of the single-precision
1867 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1868 | before being returned.  `zSign' is ignored if the result is a NaN.
1869 | The addition is performed according to the IEC/IEEE Standard for Binary
1870 | Floating-Point Arithmetic.
1871 *----------------------------------------------------------------------------*/
1872 
1873 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1874 {
1875     int_fast16_t aExp, bExp, zExp;
1876     uint32_t aSig, bSig, zSig;
1877     int_fast16_t expDiff;
1878 
1879     aSig = extractFloat32Frac( a );
1880     aExp = extractFloat32Exp( a );
1881     bSig = extractFloat32Frac( b );
1882     bExp = extractFloat32Exp( b );
1883     expDiff = aExp - bExp;
1884     aSig <<= 6;
1885     bSig <<= 6;
1886     if ( 0 < expDiff ) {
1887         if ( aExp == 0xFF ) {
1888             if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1889             return a;
1890         }
1891         if ( bExp == 0 ) {
1892             --expDiff;
1893         }
1894         else {
1895             bSig |= 0x20000000;
1896         }
1897         shift32RightJamming( bSig, expDiff, &bSig );
1898         zExp = aExp;
1899     }
1900     else if ( expDiff < 0 ) {
1901         if ( bExp == 0xFF ) {
1902             if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1903             return packFloat32( zSign, 0xFF, 0 );
1904         }
1905         if ( aExp == 0 ) {
1906             ++expDiff;
1907         }
1908         else {
1909             aSig |= 0x20000000;
1910         }
1911         shift32RightJamming( aSig, - expDiff, &aSig );
1912         zExp = bExp;
1913     }
1914     else {
1915         if ( aExp == 0xFF ) {
1916             if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1917             return a;
1918         }
1919         if ( aExp == 0 ) {
1920             if (STATUS(flush_to_zero)) {
1921                 if (aSig | bSig) {
1922                     float_raise(float_flag_output_denormal STATUS_VAR);
1923                 }
1924                 return packFloat32(zSign, 0, 0);
1925             }
1926             return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1927         }
1928         zSig = 0x40000000 + aSig + bSig;
1929         zExp = aExp;
1930         goto roundAndPack;
1931     }
1932     aSig |= 0x20000000;
1933     zSig = ( aSig + bSig )<<1;
1934     --zExp;
1935     if ( (int32_t) zSig < 0 ) {
1936         zSig = aSig + bSig;
1937         ++zExp;
1938     }
1939  roundAndPack:
1940     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1941 
1942 }
1943 
/*----------------------------------------------------------------------------
| Returns the result of subtracting the absolute values of the single-
| precision floating-point values `a' and `b'.  If `zSign' is 1, the
| difference is negated before being returned.  `zSign' is ignored if the
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic.
|
| The function is organised around four labels: `aExpBigger' and
| `bExpBigger' align the operand with the smaller exponent, then fall into
| `aBigger' and `bBigger', which form the difference with the larger
| significand first.  When `b' is the larger operand `zSign' is flipped.
*----------------------------------------------------------------------------*/

static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
{
    int_fast16_t aExp, bExp, zExp;
    uint32_t aSig, bSig, zSig;
    int_fast16_t expDiff;

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    bSig = extractFloat32Frac( b );
    bExp = extractFloat32Exp( b );
    expDiff = aExp - bExp;
    /* Both fractions are moved up by 7 bits before any arithmetic. */
    aSig <<= 7;
    bSig <<= 7;
    if ( 0 < expDiff ) goto aExpBigger;
    if ( expDiff < 0 ) goto bExpBigger;
    /* From here on the two exponent fields are equal. */
    if ( aExp == 0xFF ) {
        if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
        /* Both exponent fields are 0xFF with zero fractions: invalid. */
        float_raise( float_flag_invalid STATUS_VAR);
        return float32_default_nan;
    }
    if ( aExp == 0 ) {
        /* Both exponent fields are zero; they are treated as 1 below. */
        aExp = 1;
        bExp = 1;
    }
    if ( bSig < aSig ) goto aBigger;
    if ( aSig < bSig ) goto bBigger;
    /* Equal magnitudes: the result is a zero whose sign bit is set only
     * in the round-down mode. */
    return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
 bExpBigger:
    if ( bExp == 0xFF ) {
        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
        /* `b' has exponent 0xFF and a zero fraction: the result takes the
         * opposite of zSign. */
        return packFloat32( zSign ^ 1, 0xFF, 0 );
    }
    if ( aExp == 0 ) {
        ++expDiff;
    }
    else {
        aSig |= 0x40000000;
    }
    /* Align `a' with `b'; expDiff is negative on this path. */
    shift32RightJamming( aSig, - expDiff, &aSig );
    bSig |= 0x40000000;
 bBigger:
    zSig = bSig - aSig;
    zExp = bExp;
    zSign ^= 1;
    goto normalizeRoundAndPack;
 aExpBigger:
    if ( aExp == 0xFF ) {
        if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
        return a;
    }
    if ( bExp == 0 ) {
        --expDiff;
    }
    else {
        bSig |= 0x40000000;
    }
    /* Align `b' with `a'. */
    shift32RightJamming( bSig, expDiff, &bSig );
    aSig |= 0x40000000;
 aBigger:
    zSig = aSig - bSig;
    zExp = aExp;
 normalizeRoundAndPack:
    /* The exponent is decremented once before normalisation and rounding
     * are delegated to normalizeRoundAndPackFloat32. */
    --zExp;
    return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );

}
2018 
2019 /*----------------------------------------------------------------------------
2020 | Returns the result of adding the single-precision floating-point values `a'
2021 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
2022 | Binary Floating-Point Arithmetic.
2023 *----------------------------------------------------------------------------*/
2024 
2025 float32 float32_add( float32 a, float32 b STATUS_PARAM )
2026 {
2027     flag aSign, bSign;
2028     a = float32_squash_input_denormal(a STATUS_VAR);
2029     b = float32_squash_input_denormal(b STATUS_VAR);
2030 
2031     aSign = extractFloat32Sign( a );
2032     bSign = extractFloat32Sign( b );
2033     if ( aSign == bSign ) {
2034         return addFloat32Sigs( a, b, aSign STATUS_VAR);
2035     }
2036     else {
2037         return subFloat32Sigs( a, b, aSign STATUS_VAR );
2038     }
2039 
2040 }
2041 
2042 /*----------------------------------------------------------------------------
2043 | Returns the result of subtracting the single-precision floating-point values
2044 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2045 | for Binary Floating-Point Arithmetic.
2046 *----------------------------------------------------------------------------*/
2047 
2048 float32 float32_sub( float32 a, float32 b STATUS_PARAM )
2049 {
2050     flag aSign, bSign;
2051     a = float32_squash_input_denormal(a STATUS_VAR);
2052     b = float32_squash_input_denormal(b STATUS_VAR);
2053 
2054     aSign = extractFloat32Sign( a );
2055     bSign = extractFloat32Sign( b );
2056     if ( aSign == bSign ) {
2057         return subFloat32Sigs( a, b, aSign STATUS_VAR );
2058     }
2059     else {
2060         return addFloat32Sigs( a, b, aSign STATUS_VAR );
2061     }
2062 
2063 }
2064 
2065 /*----------------------------------------------------------------------------
2066 | Returns the result of multiplying the single-precision floating-point values
2067 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2068 | for Binary Floating-Point Arithmetic.
2069 *----------------------------------------------------------------------------*/
2070 
2071 float32 float32_mul( float32 a, float32 b STATUS_PARAM )
2072 {
2073     flag aSign, bSign, zSign;
2074     int_fast16_t aExp, bExp, zExp;
2075     uint32_t aSig, bSig;
2076     uint64_t zSig64;
2077     uint32_t zSig;
2078 
2079     a = float32_squash_input_denormal(a STATUS_VAR);
2080     b = float32_squash_input_denormal(b STATUS_VAR);
2081 
2082     aSig = extractFloat32Frac( a );
2083     aExp = extractFloat32Exp( a );
2084     aSign = extractFloat32Sign( a );
2085     bSig = extractFloat32Frac( b );
2086     bExp = extractFloat32Exp( b );
2087     bSign = extractFloat32Sign( b );
2088     zSign = aSign ^ bSign;
2089     if ( aExp == 0xFF ) {
2090         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2091             return propagateFloat32NaN( a, b STATUS_VAR );
2092         }
2093         if ( ( bExp | bSig ) == 0 ) {
2094             float_raise( float_flag_invalid STATUS_VAR);
2095             return float32_default_nan;
2096         }
2097         return packFloat32( zSign, 0xFF, 0 );
2098     }
2099     if ( bExp == 0xFF ) {
2100         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2101         if ( ( aExp | aSig ) == 0 ) {
2102             float_raise( float_flag_invalid STATUS_VAR);
2103             return float32_default_nan;
2104         }
2105         return packFloat32( zSign, 0xFF, 0 );
2106     }
2107     if ( aExp == 0 ) {
2108         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2109         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2110     }
2111     if ( bExp == 0 ) {
2112         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
2113         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2114     }
2115     zExp = aExp + bExp - 0x7F;
2116     aSig = ( aSig | 0x00800000 )<<7;
2117     bSig = ( bSig | 0x00800000 )<<8;
2118     shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );
2119     zSig = zSig64;
2120     if ( 0 <= (int32_t) ( zSig<<1 ) ) {
2121         zSig <<= 1;
2122         --zExp;
2123     }
2124     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2125 
2126 }
2127 
/*----------------------------------------------------------------------------
| Returns the result of dividing the single-precision floating-point value `a'
| by the corresponding value `b'.  The operation is performed according to the
| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
|
| NaN operands are propagated.  Infinity divided by infinity and zero divided
| by zero raise the invalid exception and return the default NaN.  Any other
| division by zero raises the divide-by-zero exception and returns an
| infinity whose sign is the exclusive-or of the operand signs.
*----------------------------------------------------------------------------*/

float32 float32_div( float32 a, float32 b STATUS_PARAM )
{
    flag aSign, bSign, zSign;
    int_fast16_t aExp, bExp, zExp;
    uint32_t aSig, bSig, zSig;
    a = float32_squash_input_denormal(a STATUS_VAR);
    b = float32_squash_input_denormal(b STATUS_VAR);

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    bSig = extractFloat32Frac( b );
    bExp = extractFloat32Exp( b );
    bSign = extractFloat32Sign( b );
    zSign = aSign ^ bSign;
    if ( aExp == 0xFF ) {
        /* `a' is a NaN or an infinity. */
        if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
        if ( bExp == 0xFF ) {
            if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
            /* infinity / infinity */
            float_raise( float_flag_invalid STATUS_VAR);
            return float32_default_nan;
        }
        return packFloat32( zSign, 0xFF, 0 );
    }
    if ( bExp == 0xFF ) {
        /* `b' is a NaN or an infinity; a finite `a' over infinity is zero. */
        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
        return packFloat32( zSign, 0, 0 );
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            /* Division by zero. */
            if ( ( aExp | aSig ) == 0 ) {
                /* zero / zero */
                float_raise( float_flag_invalid STATUS_VAR);
                return float32_default_nan;
            }
            float_raise( float_flag_divbyzero STATUS_VAR);
            return packFloat32( zSign, 0xFF, 0 );
        }
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    zExp = aExp - bExp + 0x7D;
    aSig = ( aSig | 0x00800000 )<<7;
    bSig = ( bSig | 0x00800000 )<<8;
    /* When the dividend significand is at least as large as the divisor's,
     * halve it and bump the exponent. */
    if ( bSig <= ( aSig + aSig ) ) {
        aSig >>= 1;
        ++zExp;
    }
    zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;
    if ( ( zSig & 0x3F ) == 0 ) {
        /* The low six quotient bits are all zero: set bit 0 when the
         * division left a remainder, so the inexactness is not lost. */
        zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );
    }
    return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );

}
2191 
/*----------------------------------------------------------------------------
| Returns the remainder of the single-precision floating-point value `a'
| with respect to the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
|
| NaN operands are propagated.  An infinite `a' or a zero `b' raises the
| invalid exception and returns the default NaN.  An infinite `b' or a zero
| `a' returns `a' unchanged.
*----------------------------------------------------------------------------*/

float32 float32_rem( float32 a, float32 b STATUS_PARAM )
{
    flag aSign, zSign;
    int_fast16_t aExp, bExp, expDiff;
    uint32_t aSig, bSig;
    uint32_t q;
    uint64_t aSig64, bSig64, q64;
    uint32_t alternateASig;
    int32_t sigMean;
    a = float32_squash_input_denormal(a STATUS_VAR);
    b = float32_squash_input_denormal(b STATUS_VAR);

    aSig = extractFloat32Frac( a );
    aExp = extractFloat32Exp( a );
    aSign = extractFloat32Sign( a );
    bSig = extractFloat32Frac( b );
    bExp = extractFloat32Exp( b );
    if ( aExp == 0xFF ) {
        /* `a' is a NaN or an infinity. */
        if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
            return propagateFloat32NaN( a, b STATUS_VAR );
        }
        float_raise( float_flag_invalid STATUS_VAR);
        return float32_default_nan;
    }
    if ( bExp == 0xFF ) {
        /* `b' is a NaN or an infinity. */
        if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
        return a;
    }
    if ( bExp == 0 ) {
        if ( bSig == 0 ) {
            /* Remainder with respect to zero. */
            float_raise( float_flag_invalid STATUS_VAR);
            return float32_default_nan;
        }
        normalizeFloat32Subnormal( bSig, &bExp, &bSig );
    }
    if ( aExp == 0 ) {
        if ( aSig == 0 ) return a;
        normalizeFloat32Subnormal( aSig, &aExp, &aSig );
    }
    expDiff = aExp - bExp;
    aSig |= 0x00800000;
    bSig |= 0x00800000;
    if ( expDiff < 32 ) {
        /* Small exponent gap: a single 64-by-32-bit division is used. */
        aSig <<= 8;
        bSig <<= 8;
        if ( expDiff < 0 ) {
            /* The exponent of `a' is at least two below that of `b':
             * `a' is returned unchanged. */
            if ( expDiff < -1 ) return a;
            aSig >>= 1;
        }
        q = ( bSig <= aSig );
        if ( q ) aSig -= bSig;
        if ( 0 < expDiff ) {
            q = ( ( (uint64_t) aSig )<<32 ) / bSig;
            q >>= 32 - expDiff;
            bSig >>= 2;
            aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
        }
        else {
            aSig >>= 2;
            bSig >>= 2;
        }
    }
    else {
        /* Large exponent gap: the exponent difference is consumed 62 at a
         * time, using quotient estimates from estimateDiv128To64 that are
         * lowered by 2 (clamped at 0) before use. */
        if ( bSig <= aSig ) aSig -= bSig;
        aSig64 = ( (uint64_t) aSig )<<40;
        bSig64 = ( (uint64_t) bSig )<<40;
        expDiff -= 64;
        while ( 0 < expDiff ) {
            q64 = estimateDiv128To64( aSig64, 0, bSig64 );
            q64 = ( 2 < q64 ) ? q64 - 2 : 0;
            aSig64 = - ( ( bSig * q64 )<<38 );
            expDiff -= 62;
        }
        expDiff += 64;
        q64 = estimateDiv128To64( aSig64, 0, bSig64 );
        q64 = ( 2 < q64 ) ? q64 - 2 : 0;
        q = q64>>( 64 - expDiff );
        bSig <<= 6;
        aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
    }
    /* Keep subtracting bSig, counting in q, until aSig turns negative when
     * read as a signed value.  alternateASig holds the value from just
     * before the final subtraction. */
    do {
        alternateASig = aSig;
        ++q;
        aSig -= bSig;
    } while ( 0 <= (int32_t) aSig );
    /* Choose between the two candidates: the earlier one is used when
     * their sum is negative, or when the sum is zero and q is odd. */
    sigMean = aSig + alternateASig;
    if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
        aSig = alternateASig;
    }
    /* A negative candidate flips the result sign relative to `a'; its
     * magnitude is what gets packed. */
    zSign = ( (int32_t) aSig < 0 );
    if ( zSign ) aSig = - aSig;
    return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );

}
2292 
2293 /*----------------------------------------------------------------------------
2294 | Returns the result of multiplying the single-precision floating-point values
2295 | `a' and `b' then adding 'c', with no intermediate rounding step after the
2296 | multiplication.  The operation is performed according to the IEC/IEEE
2297 | Standard for Binary Floating-Point Arithmetic 754-2008.
2298 | The flags argument allows the caller to select negation of the
2299 | addend, the intermediate product, or the final result. (The difference
2300 | between this and having the caller do a separate negation is that negating
2301 | externally will flip the sign bit on NaNs.)
2302 *----------------------------------------------------------------------------*/
2303 
float32 float32_muladd(float32 a, float32 b, float32 c, int flags STATUS_PARAM)
{
    /* Naming used below: "p" is the product a * b, "z" is the final result.
     * Variables ending in 64 hold significands widened to 64 bits.
     */
    flag aSign, bSign, cSign, zSign;
    int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
    uint32_t aSig, bSig, cSig;
    flag pInf, pZero, pSign;
    uint64_t pSig64, cSig64, zSig64;
    uint32_t pSig;
    int shiftcount;
    flag signflip, infzero;

    /* Pre-process the operands (helper not visible here; presumably it
     * flushes denormal inputs when the status requests it), then split each
     * one into fraction, biased exponent and sign.
     */
    a = float32_squash_input_denormal(a STATUS_VAR);
    b = float32_squash_input_denormal(b STATUS_VAR);
    c = float32_squash_input_denormal(c STATUS_VAR);
    aSig = extractFloat32Frac(a);
    aExp = extractFloat32Exp(a);
    aSign = extractFloat32Sign(a);
    bSig = extractFloat32Frac(b);
    bExp = extractFloat32Exp(b);
    bSign = extractFloat32Sign(b);
    cSig = extractFloat32Frac(c);
    cExp = extractFloat32Exp(c);
    cSign = extractFloat32Sign(c);

    /* infzero: the multiplication is zero * infinity (in either order) */
    infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) ||
               (aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0));

    /* It is implementation-defined whether the cases of (0,inf,qnan)
     * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
     * they return if they do), so we have to hand this information
     * off to the target-specific pick-a-NaN routine.
     */
    if (((aExp == 0xff) && aSig) ||
        ((bExp == 0xff) && bSig) ||
        ((cExp == 0xff) && cSig)) {
        return propagateFloat32MulAddNaN(a, b, c, infzero STATUS_VAR);
    }

    /* No NaN operand, but the product itself is 0 * inf: invalid */
    if (infzero) {
        float_raise(float_flag_invalid STATUS_VAR);
        return float32_default_nan;
    }

    /* The negate flags are applied only from here on, i.e. after the NaN
     * cases above have returned, so NaN results are unaffected by them.
     */
    if (flags & float_muladd_negate_c) {
        cSign ^= 1;
    }

    signflip = (flags & float_muladd_negate_result) ? 1 : 0;

    /* Work out the sign and type of the product */
    pSign = aSign ^ bSign;
    if (flags & float_muladd_negate_product) {
        pSign ^= 1;
    }
    /* NaNs were handled above, so a maximum exponent means infinity here */
    pInf = (aExp == 0xff) || (bExp == 0xff);
    pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);

    if (cExp == 0xff) {
        if (pInf && (pSign ^ cSign)) {
            /* addition of opposite-signed infinities => InvalidOperation */
            float_raise(float_flag_invalid STATUS_VAR);
            return float32_default_nan;
        }
        /* Otherwise generate an infinity of the same sign */
        return packFloat32(cSign ^ signflip, 0xff, 0);
    }

    /* Infinite product plus finite addend: infinity with the product's sign */
    if (pInf) {
        return packFloat32(pSign ^ signflip, 0xff, 0);
    }

    if (pZero) {
        if (cExp == 0) {
            if (cSig == 0) {
                /* Adding two exact zeroes */
                if (pSign == cSign) {
                    zSign = pSign;
                } else if (STATUS(float_rounding_mode) == float_round_down) {
                    zSign = 1;
                } else {
                    zSign = 0;
                }
                return packFloat32(zSign ^ signflip, 0, 0);
            }
            /* Exact zero plus a denorm */
            if (STATUS(flush_to_zero)) {
                float_raise(float_flag_output_denormal STATUS_VAR);
                return packFloat32(cSign ^ signflip, 0, 0);
            }
        }
        /* Zero plus something non-zero : just return the something */
        if (flags & float_muladd_halve_result) {
            if (cExp == 0) {
                normalizeFloat32Subnormal(cSig, &cExp, &cSig);
            }
            /* Subtract one to halve, and one again because roundAndPackFloat32
             * wants one less than the true exponent.
             */
            cExp -= 2;
            /* Explicit leading bit ends up at bit 30 */
            cSig = (cSig | 0x00800000) << 7;
            return roundAndPackFloat32(cSign ^ signflip, cExp, cSig STATUS_VAR);
        }
        return packFloat32(cSign ^ signflip, cExp, cSig);
    }

    /* From here on a and b are both finite and non-zero */
    if (aExp == 0) {
        normalizeFloat32Subnormal(aSig, &aExp, &aSig);
    }
    if (bExp == 0) {
        normalizeFloat32Subnormal(bSig, &bExp, &bSig);
    }

    /* Calculate the actual result a * b + c */

    /* Multiply first; this is easy. */
    /* NB: we subtract 0x7e where float32_mul() subtracts 0x7f
     * because we want the true exponent, not the "one-less-than"
     * flavour that roundAndPackFloat32() takes.
     */
    pExp = aExp + bExp - 0x7e;
    /* aSig gets its explicit bit at bit 30 and bSig at bit 31, so the
     * 64-bit product has its leading bit at position 61 or 62.
     */
    aSig = (aSig | 0x00800000) << 7;
    bSig = (bSig | 0x00800000) << 8;
    pSig64 = (uint64_t)aSig * bSig;
    /* Bit 62 clear: shift up by one and compensate in the exponent */
    if ((int64_t)(pSig64 << 1) >= 0) {
        pSig64 <<= 1;
        pExp--;
    }

    /* Provisional result sign; flipped below if c turns out to dominate */
    zSign = pSign ^ signflip;

    /* Now pSig64 is the significand of the multiply, with the explicit bit in
     * position 62.
     */
    if (cExp == 0) {
        if (!cSig) {
            /* Throw out the special case of c being an exact zero now */
            /* Reduce to 32 bits, folding the discarded bits into a sticky
             * bit, so the explicit bit lands at bit 30.
             */
            shift64RightJamming(pSig64, 32, &pSig64);
            pSig = pSig64;
            if (flags & float_muladd_halve_result) {
                pExp--;
            }
            /* pExp - 1: convert the true exponent to the one-less-than form */
            return roundAndPackFloat32(zSign, pExp - 1,
                                       pSig STATUS_VAR);
        }
        normalizeFloat32Subnormal(cSig, &cExp, &cSig);
    }

    /* Widen c so that its explicit bit is also at position 62 */
    cSig64 = (uint64_t)cSig << (62 - 23);
    cSig64 |= LIT64(0x4000000000000000);
    expDiff = pExp - cExp;

    if (pSign == cSign) {
        /* Addition */
        if (expDiff > 0) {
            /* scale c to match p */
            shift64RightJamming(cSig64, expDiff, &cSig64);
            zExp = pExp;
        } else if (expDiff < 0) {
            /* scale p to match c */
            shift64RightJamming(pSig64, -expDiff, &pSig64);
            zExp = cExp;
        } else {
            /* no scaling needed */
            zExp = cExp;
        }
        /* Add significands and make sure explicit bit ends up in posn 62 */
        zSig64 = pSig64 + cSig64;
        if ((int64_t)zSig64 < 0) {
            /* Carry into bit 63: shift back down; the exponent increase
             * this implies cancels the decrement done in the other branch.
             */
            shift64RightJamming(zSig64, 1, &zSig64);
        } else {
            zExp--;
        }
    } else {
        /* Subtraction */
        if (expDiff > 0) {
            shift64RightJamming(cSig64, expDiff, &cSig64);
            zSig64 = pSig64 - cSig64;
            zExp = pExp;
        } else if (expDiff < 0) {
            shift64RightJamming(pSig64, -expDiff, &pSig64);
            zSig64 = cSig64 - pSig64;
            zExp = cExp;
            /* c is subtracted from, so the result sign is reversed */
            zSign ^= 1;
        } else {
            /* Equal exponents: subtract the smaller significand from the
             * larger one so that zSig64 stays non-negative.
             */
            zExp = pExp;
            if (cSig64 < pSig64) {
                zSig64 = pSig64 - cSig64;
            } else if (pSig64 < cSig64) {
                zSig64 = cSig64 - pSig64;
                zSign ^= 1;
            } else {
                /* Exact zero */
                /* Sign bit is signflip, inverted when rounding down */
                zSign = signflip;
                if (STATUS(float_rounding_mode) == float_round_down) {
                    zSign ^= 1;
                }
                return packFloat32(zSign, 0, 0);
            }
        }
        --zExp;
        /* Normalize to put the explicit bit back into bit 62. */
        shiftcount = countLeadingZeros64(zSig64) - 1;
        zSig64 <<= shiftcount;
        zExp -= shiftcount;
    }
    /* Halving the result is just one more exponent decrement */
    if (flags & float_muladd_halve_result) {
        zExp--;
    }

    /* Reduce to a 32-bit significand, keeping discarded bits as sticky */
    shift64RightJamming(zSig64, 32, &zSig64);
    return roundAndPackFloat32(zSign, zExp, zSig64 STATUS_VAR);
}
2516 
2517 
2518 /*----------------------------------------------------------------------------
2519 | Returns the square root of the single-precision floating-point value `a'.
2520 | The operation is performed according to the IEC/IEEE Standard for Binary
2521 | Floating-Point Arithmetic.
2522 *----------------------------------------------------------------------------*/
2523 
2524 float32 float32_sqrt( float32 a STATUS_PARAM )
2525 {
2526     flag aSign;
2527     int_fast16_t aExp, zExp;
2528     uint32_t aSig, zSig;
2529     uint64_t rem, term;
2530     a = float32_squash_input_denormal(a STATUS_VAR);
2531 
2532     aSig = extractFloat32Frac( a );
2533     aExp = extractFloat32Exp( a );
2534     aSign = extractFloat32Sign( a );
2535     if ( aExp == 0xFF ) {
2536         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2537         if ( ! aSign ) return a;
2538         float_raise( float_flag_invalid STATUS_VAR);
2539         return float32_default_nan;
2540     }
2541     if ( aSign ) {
2542         if ( ( aExp | aSig ) == 0 ) return a;
2543         float_raise( float_flag_invalid STATUS_VAR);
2544         return float32_default_nan;
2545     }
2546     if ( aExp == 0 ) {
2547         if ( aSig == 0 ) return float32_zero;
2548         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2549     }
2550     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2551     aSig = ( aSig | 0x00800000 )<<8;
2552     zSig = estimateSqrt32( aExp, aSig ) + 2;
2553     if ( ( zSig & 0x7F ) <= 5 ) {
2554         if ( zSig < 2 ) {
2555             zSig = 0x7FFFFFFF;
2556             goto roundAndPack;
2557         }
2558         aSig >>= aExp & 1;
2559         term = ( (uint64_t) zSig ) * zSig;
2560         rem = ( ( (uint64_t) aSig )<<32 ) - term;
2561         while ( (int64_t) rem < 0 ) {
2562             --zSig;
2563             rem += ( ( (uint64_t) zSig )<<1 ) | 1;
2564         }
2565         zSig |= ( rem != 0 );
2566     }
2567     shift32RightJamming( zSig, 1, &zSig );
2568  roundAndPack:
2569     return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2570 
2571 }
2572 
2573 /*----------------------------------------------------------------------------
2574 | Returns the binary exponential of the single-precision floating-point value
2575 | `a'. The operation is performed according to the IEC/IEEE Standard for
2576 | Binary Floating-Point Arithmetic.
2577 |
2578 | Uses the following identities:
2579 |
2580 | 1. -------------------------------------------------------------------------
2581 |      x    x*ln(2)
2582 |     2  = e
2583 |
2584 | 2. -------------------------------------------------------------------------
2585 |                      2     3     4     5           n
2586 |      x        x     x     x     x     x           x
2587 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2588 |               1!    2!    3!    4!    5!          n!
2589 *----------------------------------------------------------------------------*/
2590 
/* Taylor-series coefficients consumed by float32_exp2(): each entry is the
 * double-precision bit pattern of 1/n!, where n is the number given in the
 * trailing comment (1.0, 0.5, 1/6, 1/24, ...).
 */
static const float64 float32_exp2_coefficients[15] =
{
    const_float64( 0x3ff0000000000000ll ), /*  1 */
    const_float64( 0x3fe0000000000000ll ), /*  2 */
    const_float64( 0x3fc5555555555555ll ), /*  3 */
    const_float64( 0x3fa5555555555555ll ), /*  4 */
    const_float64( 0x3f81111111111111ll ), /*  5 */
    const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
    const_float64( 0x3f2a01a01a01a01all ), /*  7 */
    const_float64( 0x3efa01a01a01a01all ), /*  8 */
    const_float64( 0x3ec71de3a556c734ll ), /*  9 */
    const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
    const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
    const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
    const_float64( 0x3de6124613a86d09ll ), /* 13 */
    const_float64( 0x3da93974a8c07c9dll ), /* 14 */
    const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
};
2609 
2610 float32 float32_exp2( float32 a STATUS_PARAM )
2611 {
2612     flag aSign;
2613     int_fast16_t aExp;
2614     uint32_t aSig;
2615     float64 r, x, xn;
2616     int i;
2617     a = float32_squash_input_denormal(a STATUS_VAR);
2618 
2619     aSig = extractFloat32Frac( a );
2620     aExp = extractFloat32Exp( a );
2621     aSign = extractFloat32Sign( a );
2622 
2623     if ( aExp == 0xFF) {
2624         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2625         return (aSign) ? float32_zero : a;
2626     }
2627     if (aExp == 0) {
2628         if (aSig == 0) return float32_one;
2629     }
2630 
2631     float_raise( float_flag_inexact STATUS_VAR);
2632 
2633     /* ******************************* */
2634     /* using float64 for approximation */
2635     /* ******************************* */
2636     x = float32_to_float64(a STATUS_VAR);
2637     x = float64_mul(x, float64_ln2 STATUS_VAR);
2638 
2639     xn = x;
2640     r = float64_one;
2641     for (i = 0 ; i < 15 ; i++) {
2642         float64 f;
2643 
2644         f = float64_mul(xn, float32_exp2_coefficients[i] STATUS_VAR);
2645         r = float64_add(r, f STATUS_VAR);
2646 
2647         xn = float64_mul(xn, x STATUS_VAR);
2648     }
2649 
2650     return float64_to_float32(r, status);
2651 }
2652 
2653 /*----------------------------------------------------------------------------
2654 | Returns the binary log of the single-precision floating-point value `a'.
2655 | The operation is performed according to the IEC/IEEE Standard for Binary
2656 | Floating-Point Arithmetic.
2657 *----------------------------------------------------------------------------*/
2658 float32 float32_log2( float32 a STATUS_PARAM )
2659 {
2660     flag aSign, zSign;
2661     int_fast16_t aExp;
2662     uint32_t aSig, zSig, i;
2663 
2664     a = float32_squash_input_denormal(a STATUS_VAR);
2665     aSig = extractFloat32Frac( a );
2666     aExp = extractFloat32Exp( a );
2667     aSign = extractFloat32Sign( a );
2668 
2669     if ( aExp == 0 ) {
2670         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2671         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2672     }
2673     if ( aSign ) {
2674         float_raise( float_flag_invalid STATUS_VAR);
2675         return float32_default_nan;
2676     }
2677     if ( aExp == 0xFF ) {
2678         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2679         return a;
2680     }
2681 
2682     aExp -= 0x7F;
2683     aSig |= 0x00800000;
2684     zSign = aExp < 0;
2685     zSig = aExp << 23;
2686 
2687     for (i = 1 << 22; i > 0; i >>= 1) {
2688         aSig = ( (uint64_t)aSig * aSig ) >> 23;
2689         if ( aSig & 0x01000000 ) {
2690             aSig >>= 1;
2691             zSig |= i;
2692         }
2693     }
2694 
2695     if ( zSign )
2696         zSig = -zSig;
2697 
2698     return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );
2699 }
2700 
2701 /*----------------------------------------------------------------------------
2702 | Returns 1 if the single-precision floating-point value `a' is equal to
2703 | the corresponding value `b', and 0 otherwise.  The invalid exception is
2704 | raised if either operand is a NaN.  Otherwise, the comparison is performed
2705 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2706 *----------------------------------------------------------------------------*/
2707 
2708 int float32_eq( float32 a, float32 b STATUS_PARAM )
2709 {
2710     uint32_t av, bv;
2711     a = float32_squash_input_denormal(a STATUS_VAR);
2712     b = float32_squash_input_denormal(b STATUS_VAR);
2713 
2714     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2715          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2716        ) {
2717         float_raise( float_flag_invalid STATUS_VAR);
2718         return 0;
2719     }
2720     av = float32_val(a);
2721     bv = float32_val(b);
2722     return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2723 }
2724 
2725 /*----------------------------------------------------------------------------
2726 | Returns 1 if the single-precision floating-point value `a' is less than
2727 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
2728 | exception is raised if either operand is a NaN.  The comparison is performed
2729 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2730 *----------------------------------------------------------------------------*/
2731 
2732 int float32_le( float32 a, float32 b STATUS_PARAM )
2733 {
2734     flag aSign, bSign;
2735     uint32_t av, bv;
2736     a = float32_squash_input_denormal(a STATUS_VAR);
2737     b = float32_squash_input_denormal(b STATUS_VAR);
2738 
2739     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2740          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2741        ) {
2742         float_raise( float_flag_invalid STATUS_VAR);
2743         return 0;
2744     }
2745     aSign = extractFloat32Sign( a );
2746     bSign = extractFloat32Sign( b );
2747     av = float32_val(a);
2748     bv = float32_val(b);
2749     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2750     return ( av == bv ) || ( aSign ^ ( av < bv ) );
2751 
2752 }
2753 
2754 /*----------------------------------------------------------------------------
2755 | Returns 1 if the single-precision floating-point value `a' is less than
2756 | the corresponding value `b', and 0 otherwise.  The invalid exception is
2757 | raised if either operand is a NaN.  The comparison is performed according
2758 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2759 *----------------------------------------------------------------------------*/
2760 
2761 int float32_lt( float32 a, float32 b STATUS_PARAM )
2762 {
2763     flag aSign, bSign;
2764     uint32_t av, bv;
2765     a = float32_squash_input_denormal(a STATUS_VAR);
2766     b = float32_squash_input_denormal(b STATUS_VAR);
2767 
2768     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2769          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2770        ) {
2771         float_raise( float_flag_invalid STATUS_VAR);
2772         return 0;
2773     }
2774     aSign = extractFloat32Sign( a );
2775     bSign = extractFloat32Sign( b );
2776     av = float32_val(a);
2777     bv = float32_val(b);
2778     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2779     return ( av != bv ) && ( aSign ^ ( av < bv ) );
2780 
2781 }
2782 
2783 /*----------------------------------------------------------------------------
2784 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2785 | be compared, and 0 otherwise.  The invalid exception is raised if either
2786 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
2787 | Standard for Binary Floating-Point Arithmetic.
2788 *----------------------------------------------------------------------------*/
2789 
2790 int float32_unordered( float32 a, float32 b STATUS_PARAM )
2791 {
2792     a = float32_squash_input_denormal(a STATUS_VAR);
2793     b = float32_squash_input_denormal(b STATUS_VAR);
2794 
2795     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2796          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2797        ) {
2798         float_raise( float_flag_invalid STATUS_VAR);
2799         return 1;
2800     }
2801     return 0;
2802 }
2803 
2804 /*----------------------------------------------------------------------------
2805 | Returns 1 if the single-precision floating-point value `a' is equal to
2806 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2807 | exception.  The comparison is performed according to the IEC/IEEE Standard
2808 | for Binary Floating-Point Arithmetic.
2809 *----------------------------------------------------------------------------*/
2810 
2811 int float32_eq_quiet( float32 a, float32 b STATUS_PARAM )
2812 {
2813     a = float32_squash_input_denormal(a STATUS_VAR);
2814     b = float32_squash_input_denormal(b STATUS_VAR);
2815 
2816     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2817          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2818        ) {
2819         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2820             float_raise( float_flag_invalid STATUS_VAR);
2821         }
2822         return 0;
2823     }
2824     return ( float32_val(a) == float32_val(b) ) ||
2825             ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2826 }
2827 
2828 /*----------------------------------------------------------------------------
2829 | Returns 1 if the single-precision floating-point value `a' is less than or
2830 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2831 | cause an exception.  Otherwise, the comparison is performed according to the
2832 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2833 *----------------------------------------------------------------------------*/
2834 
2835 int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2836 {
2837     flag aSign, bSign;
2838     uint32_t av, bv;
2839     a = float32_squash_input_denormal(a STATUS_VAR);
2840     b = float32_squash_input_denormal(b STATUS_VAR);
2841 
2842     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2843          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2844        ) {
2845         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2846             float_raise( float_flag_invalid STATUS_VAR);
2847         }
2848         return 0;
2849     }
2850     aSign = extractFloat32Sign( a );
2851     bSign = extractFloat32Sign( b );
2852     av = float32_val(a);
2853     bv = float32_val(b);
2854     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2855     return ( av == bv ) || ( aSign ^ ( av < bv ) );
2856 
2857 }
2858 
2859 /*----------------------------------------------------------------------------
2860 | Returns 1 if the single-precision floating-point value `a' is less than
2861 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2862 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2863 | Standard for Binary Floating-Point Arithmetic.
2864 *----------------------------------------------------------------------------*/
2865 
2866 int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2867 {
2868     flag aSign, bSign;
2869     uint32_t av, bv;
2870     a = float32_squash_input_denormal(a STATUS_VAR);
2871     b = float32_squash_input_denormal(b STATUS_VAR);
2872 
2873     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2874          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2875        ) {
2876         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2877             float_raise( float_flag_invalid STATUS_VAR);
2878         }
2879         return 0;
2880     }
2881     aSign = extractFloat32Sign( a );
2882     bSign = extractFloat32Sign( b );
2883     av = float32_val(a);
2884     bv = float32_val(b);
2885     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2886     return ( av != bv ) && ( aSign ^ ( av < bv ) );
2887 
2888 }
2889 
2890 /*----------------------------------------------------------------------------
2891 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2892 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
2893 | comparison is performed according to the IEC/IEEE Standard for Binary
2894 | Floating-Point Arithmetic.
2895 *----------------------------------------------------------------------------*/
2896 
2897 int float32_unordered_quiet( float32 a, float32 b STATUS_PARAM )
2898 {
2899     a = float32_squash_input_denormal(a STATUS_VAR);
2900     b = float32_squash_input_denormal(b STATUS_VAR);
2901 
2902     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2903          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2904        ) {
2905         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2906             float_raise( float_flag_invalid STATUS_VAR);
2907         }
2908         return 1;
2909     }
2910     return 0;
2911 }
2912 
2913 /*----------------------------------------------------------------------------
2914 | Returns the result of converting the double-precision floating-point value
2915 | `a' to the 32-bit two's complement integer format.  The conversion is
2916 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2917 | Arithmetic---which means in particular that the conversion is rounded
2918 | according to the current rounding mode.  If `a' is a NaN, the largest
2919 | positive integer is returned.  Otherwise, if the conversion overflows, the
2920 | largest integer with the same sign as `a' is returned.
2921 *----------------------------------------------------------------------------*/
2922 
2923 int32 float64_to_int32( float64 a STATUS_PARAM )
2924 {
2925     flag aSign;
2926     int_fast16_t aExp, shiftCount;
2927     uint64_t aSig;
2928     a = float64_squash_input_denormal(a STATUS_VAR);
2929 
2930     aSig = extractFloat64Frac( a );
2931     aExp = extractFloat64Exp( a );
2932     aSign = extractFloat64Sign( a );
2933     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2934     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2935     shiftCount = 0x42C - aExp;
2936     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2937     return roundAndPackInt32( aSign, aSig STATUS_VAR );
2938 
2939 }
2940 
2941 /*----------------------------------------------------------------------------
2942 | Returns the result of converting the double-precision floating-point value
2943 | `a' to the 32-bit two's complement integer format.  The conversion is
2944 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2945 | Arithmetic, except that the conversion is always rounded toward zero.
2946 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2947 | the conversion overflows, the largest integer with the same sign as `a' is
2948 | returned.
2949 *----------------------------------------------------------------------------*/
2950 
2951 int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2952 {
2953     flag aSign;
2954     int_fast16_t aExp, shiftCount;
2955     uint64_t aSig, savedASig;
2956     int32_t z;
2957     a = float64_squash_input_denormal(a STATUS_VAR);
2958 
2959     aSig = extractFloat64Frac( a );
2960     aExp = extractFloat64Exp( a );
2961     aSign = extractFloat64Sign( a );
2962     if ( 0x41E < aExp ) {
2963         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2964         goto invalid;
2965     }
2966     else if ( aExp < 0x3FF ) {
2967         if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2968         return 0;
2969     }
2970     aSig |= LIT64( 0x0010000000000000 );
2971     shiftCount = 0x433 - aExp;
2972     savedASig = aSig;
2973     aSig >>= shiftCount;
2974     z = aSig;
2975     if ( aSign ) z = - z;
2976     if ( ( z < 0 ) ^ aSign ) {
2977  invalid:
2978         float_raise( float_flag_invalid STATUS_VAR);
2979         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
2980     }
2981     if ( ( aSig<<shiftCount ) != savedASig ) {
2982         STATUS(float_exception_flags) |= float_flag_inexact;
2983     }
2984     return z;
2985 
2986 }
2987 
2988 /*----------------------------------------------------------------------------
2989 | Returns the result of converting the double-precision floating-point value
2990 | `a' to the 16-bit two's complement integer format.  The conversion is
2991 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2992 | Arithmetic, except that the conversion is always rounded toward zero.
2993 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2994 | the conversion overflows, the largest integer with the same sign as `a' is
2995 | returned.
2996 *----------------------------------------------------------------------------*/
2997 
2998 int_fast16_t float64_to_int16_round_to_zero(float64 a STATUS_PARAM)
2999 {
3000     flag aSign;
3001     int_fast16_t aExp, shiftCount;
3002     uint64_t aSig, savedASig;
3003     int32 z;
3004 
3005     aSig = extractFloat64Frac( a );
3006     aExp = extractFloat64Exp( a );
3007     aSign = extractFloat64Sign( a );
3008     if ( 0x40E < aExp ) {
3009         if ( ( aExp == 0x7FF ) && aSig ) {
3010             aSign = 0;
3011         }
3012         goto invalid;
3013     }
3014     else if ( aExp < 0x3FF ) {
3015         if ( aExp || aSig ) {
3016             STATUS(float_exception_flags) |= float_flag_inexact;
3017         }
3018         return 0;
3019     }
3020     aSig |= LIT64( 0x0010000000000000 );
3021     shiftCount = 0x433 - aExp;
3022     savedASig = aSig;
3023     aSig >>= shiftCount;
3024     z = aSig;
3025     if ( aSign ) {
3026         z = - z;
3027     }
3028     if ( ( (int16_t)z < 0 ) ^ aSign ) {
3029  invalid:
3030         float_raise( float_flag_invalid STATUS_VAR);
3031         return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
3032     }
3033     if ( ( aSig<<shiftCount ) != savedASig ) {
3034         STATUS(float_exception_flags) |= float_flag_inexact;
3035     }
3036     return z;
3037 }
3038 
3039 /*----------------------------------------------------------------------------
3040 | Returns the result of converting the double-precision floating-point value
3041 | `a' to the 64-bit two's complement integer format.  The conversion is
3042 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3043 | Arithmetic---which means in particular that the conversion is rounded
3044 | according to the current rounding mode.  If `a' is a NaN, the largest
3045 | positive integer is returned.  Otherwise, if the conversion overflows, the
3046 | largest integer with the same sign as `a' is returned.
3047 *----------------------------------------------------------------------------*/
3048 
3049 int64 float64_to_int64( float64 a STATUS_PARAM )
3050 {
3051     flag aSign;
3052     int_fast16_t aExp, shiftCount;
3053     uint64_t aSig, aSigExtra;
3054     a = float64_squash_input_denormal(a STATUS_VAR);
3055 
3056     aSig = extractFloat64Frac( a );
3057     aExp = extractFloat64Exp( a );
3058     aSign = extractFloat64Sign( a );
3059     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3060     shiftCount = 0x433 - aExp;
3061     if ( shiftCount <= 0 ) {
3062         if ( 0x43E < aExp ) {
3063             float_raise( float_flag_invalid STATUS_VAR);
3064             if (    ! aSign
3065                  || (    ( aExp == 0x7FF )
3066                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
3067                ) {
3068                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3069             }
3070             return (int64_t) LIT64( 0x8000000000000000 );
3071         }
3072         aSigExtra = 0;
3073         aSig <<= - shiftCount;
3074     }
3075     else {
3076         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3077     }
3078     return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
3079 
3080 }
3081 
3082 /*----------------------------------------------------------------------------
3083 | Returns the result of converting the double-precision floating-point value
3084 | `a' to the 64-bit two's complement integer format.  The conversion is
3085 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3086 | Arithmetic, except that the conversion is always rounded toward zero.
3087 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
3088 | the conversion overflows, the largest integer with the same sign as `a' is
3089 | returned.
3090 *----------------------------------------------------------------------------*/
3091 
3092 int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
3093 {
3094     flag aSign;
3095     int_fast16_t aExp, shiftCount;
3096     uint64_t aSig;
3097     int64 z;
3098     a = float64_squash_input_denormal(a STATUS_VAR);
3099 
3100     aSig = extractFloat64Frac( a );
3101     aExp = extractFloat64Exp( a );
3102     aSign = extractFloat64Sign( a );
3103     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3104     shiftCount = aExp - 0x433;
3105     if ( 0 <= shiftCount ) {
3106         if ( 0x43E <= aExp ) {
3107             if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
3108                 float_raise( float_flag_invalid STATUS_VAR);
3109                 if (    ! aSign
3110                      || (    ( aExp == 0x7FF )
3111                           && ( aSig != LIT64( 0x0010000000000000 ) ) )
3112                    ) {
3113                     return LIT64( 0x7FFFFFFFFFFFFFFF );
3114                 }
3115             }
3116             return (int64_t) LIT64( 0x8000000000000000 );
3117         }
3118         z = aSig<<shiftCount;
3119     }
3120     else {
3121         if ( aExp < 0x3FE ) {
3122             if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3123             return 0;
3124         }
3125         z = aSig>>( - shiftCount );
3126         if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
3127             STATUS(float_exception_flags) |= float_flag_inexact;
3128         }
3129     }
3130     if ( aSign ) z = - z;
3131     return z;
3132 
3133 }
3134 
3135 /*----------------------------------------------------------------------------
3136 | Returns the result of converting the double-precision floating-point value
3137 | `a' to the single-precision floating-point format.  The conversion is
3138 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3139 | Arithmetic.
3140 *----------------------------------------------------------------------------*/
3141 
3142 float32 float64_to_float32( float64 a STATUS_PARAM )
3143 {
3144     flag aSign;
3145     int_fast16_t aExp;
3146     uint64_t aSig;
3147     uint32_t zSig;
3148     a = float64_squash_input_denormal(a STATUS_VAR);
3149 
3150     aSig = extractFloat64Frac( a );
3151     aExp = extractFloat64Exp( a );
3152     aSign = extractFloat64Sign( a );
3153     if ( aExp == 0x7FF ) {
3154         if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3155         return packFloat32( aSign, 0xFF, 0 );
3156     }
3157     shift64RightJamming( aSig, 22, &aSig );
3158     zSig = aSig;
3159     if ( aExp || zSig ) {
3160         zSig |= 0x40000000;
3161         aExp -= 0x381;
3162     }
3163     return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
3164 
3165 }
3166 
3167 
3168 /*----------------------------------------------------------------------------
3169 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3170 | half-precision floating-point value, returning the result.  After being
3171 | shifted into the proper positions, the three fields are simply added
3172 | together to form the result.  This means that any integer portion of `zSig'
3173 | will be added into the exponent.  Since a properly normalized significand
3174 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3175 | than the desired result exponent whenever `zSig' is a complete, normalized
3176 | significand.
3177 *----------------------------------------------------------------------------*/
3178 static float16 packFloat16(flag zSign, int_fast16_t zExp, uint16_t zSig)
3179 {
3180     return make_float16(
3181         (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
3182 }
3183 
3184 /*----------------------------------------------------------------------------
3185 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3186 | and significand `zSig', and returns the proper half-precision floating-
3187 | point value corresponding to the abstract input.  Ordinarily, the abstract
3188 | value is simply rounded and packed into the half-precision format, with
3189 | the inexact exception raised if the abstract input cannot be represented
3190 | exactly.  However, if the abstract value is too large, the overflow and
3191 | inexact exceptions are raised and an infinity or maximal finite value is
3192 | returned.  If the abstract value is too small, the input value is rounded to
3193 | a subnormal number, and the underflow and inexact exceptions are raised if
3194 | the abstract input cannot be represented exactly as a subnormal half-
3195 | precision floating-point number.
3196 | The `ieee' flag indicates whether to use IEEE standard half precision, or
3197 | ARM-style "alternative representation", which omits the NaN and Inf
3198 | encodings in order to raise the maximum representable exponent by one.
3199 |     The input significand `zSig' has its binary point between bits 22
3200 | and 23, which is 13 bits to the left of the usual location.  This shifted
3201 | significand must be normalized or smaller.  If `zSig' is not normalized,
3202 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3203 | and it must not require rounding.  In the usual case that `zSig' is
3204 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3205 | Note the slightly odd position of the binary point in zSig compared with the
3206 | other roundAndPackFloat functions. This should probably be fixed if we
3207 | need to implement more float16 routines than just conversion.
3208 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3209 | Binary Floating-Point Arithmetic.
3210 *----------------------------------------------------------------------------*/
3211 
3212 static float32 roundAndPackFloat16(flag zSign, int_fast16_t zExp,
3213                                    uint32_t zSig, flag ieee STATUS_PARAM)
3214 {
3215     int maxexp = ieee ? 29 : 30;
3216     uint32_t mask;
3217     uint32_t increment;
3218     bool rounding_bumps_exp;
3219     bool is_tiny = false;
3220 
3221     /* Calculate the mask of bits of the mantissa which are not
3222      * representable in half-precision and will be lost.
3223      */
3224     if (zExp < 1) {
3225         /* Will be denormal in halfprec */
3226         mask = 0x00ffffff;
3227         if (zExp >= -11) {
3228             mask >>= 11 + zExp;
3229         }
3230     } else {
3231         /* Normal number in halfprec */
3232         mask = 0x00001fff;
3233     }
3234 
3235     switch (STATUS(float_rounding_mode)) {
3236     case float_round_nearest_even:
3237         increment = (mask + 1) >> 1;
3238         if ((zSig & mask) == increment) {
3239             increment = zSig & (increment << 1);
3240         }
3241         break;
3242     case float_round_ties_away:
3243         increment = (mask + 1) >> 1;
3244         break;
3245     case float_round_up:
3246         increment = zSign ? 0 : mask;
3247         break;
3248     case float_round_down:
3249         increment = zSign ? mask : 0;
3250         break;
3251     default: /* round_to_zero */
3252         increment = 0;
3253         break;
3254     }
3255 
3256     rounding_bumps_exp = (zSig + increment >= 0x01000000);
3257 
3258     if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) {
3259         if (ieee) {
3260             float_raise(float_flag_overflow | float_flag_inexact STATUS_VAR);
3261             return packFloat16(zSign, 0x1f, 0);
3262         } else {
3263             float_raise(float_flag_invalid STATUS_VAR);
3264             return packFloat16(zSign, 0x1f, 0x3ff);
3265         }
3266     }
3267 
3268     if (zExp < 0) {
3269         /* Note that flush-to-zero does not affect half-precision results */
3270         is_tiny =
3271             (STATUS(float_detect_tininess) == float_tininess_before_rounding)
3272             || (zExp < -1)
3273             || (!rounding_bumps_exp);
3274     }
3275     if (zSig & mask) {
3276         float_raise(float_flag_inexact STATUS_VAR);
3277         if (is_tiny) {
3278             float_raise(float_flag_underflow STATUS_VAR);
3279         }
3280     }
3281 
3282     zSig += increment;
3283     if (rounding_bumps_exp) {
3284         zSig >>= 1;
3285         zExp++;
3286     }
3287 
3288     if (zExp < -10) {
3289         return packFloat16(zSign, 0, 0);
3290     }
3291     if (zExp < 0) {
3292         zSig >>= -zExp;
3293         zExp = 0;
3294     }
3295     return packFloat16(zSign, zExp, zSig >> 13);
3296 }
3297 
/*----------------------------------------------------------------------------
| Normalizes the non-zero subnormal half-precision significand `aSig'.  The
| significand is shifted left until its leading 1 sits in bit 10; the shifted
| value is stored through `zSigPtr' and the matching exponent, 1 minus the
| shift distance, is stored through `zExpPtr'.
*----------------------------------------------------------------------------*/
static void normalizeFloat16Subnormal(uint32_t aSig, int_fast16_t *zExpPtr,
                                      uint32_t *zSigPtr)
{
    /* A leading 1 in bit 10 corresponds to 21 leading zeros in 32 bits. */
    int8_t leftShift = countLeadingZeros32(aSig) - 21;

    *zExpPtr = 1 - leftShift;
    *zSigPtr = aSig << leftShift;
}
3305 
3306 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
3307    The latter gains extra exponent range by omitting the NaN/Inf encodings.  */
3308 
3309 float32 float16_to_float32(float16 a, flag ieee STATUS_PARAM)
3310 {
3311     flag aSign;
3312     int_fast16_t aExp;
3313     uint32_t aSig;
3314 
3315     aSign = extractFloat16Sign(a);
3316     aExp = extractFloat16Exp(a);
3317     aSig = extractFloat16Frac(a);
3318 
3319     if (aExp == 0x1f && ieee) {
3320         if (aSig) {
3321             return commonNaNToFloat32(float16ToCommonNaN(a STATUS_VAR) STATUS_VAR);
3322         }
3323         return packFloat32(aSign, 0xff, 0);
3324     }
3325     if (aExp == 0) {
3326         if (aSig == 0) {
3327             return packFloat32(aSign, 0, 0);
3328         }
3329 
3330         normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3331         aExp--;
3332     }
3333     return packFloat32( aSign, aExp + 0x70, aSig << 13);
3334 }
3335 
3336 float16 float32_to_float16(float32 a, flag ieee STATUS_PARAM)
3337 {
3338     flag aSign;
3339     int_fast16_t aExp;
3340     uint32_t aSig;
3341 
3342     a = float32_squash_input_denormal(a STATUS_VAR);
3343 
3344     aSig = extractFloat32Frac( a );
3345     aExp = extractFloat32Exp( a );
3346     aSign = extractFloat32Sign( a );
3347     if ( aExp == 0xFF ) {
3348         if (aSig) {
3349             /* Input is a NaN */
3350             if (!ieee) {
3351                 float_raise(float_flag_invalid STATUS_VAR);
3352                 return packFloat16(aSign, 0, 0);
3353             }
3354             return commonNaNToFloat16(
3355                 float32ToCommonNaN(a STATUS_VAR) STATUS_VAR);
3356         }
3357         /* Infinity */
3358         if (!ieee) {
3359             float_raise(float_flag_invalid STATUS_VAR);
3360             return packFloat16(aSign, 0x1f, 0x3ff);
3361         }
3362         return packFloat16(aSign, 0x1f, 0);
3363     }
3364     if (aExp == 0 && aSig == 0) {
3365         return packFloat16(aSign, 0, 0);
3366     }
3367     /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3368      * even if the input is denormal; however this is harmless because
3369      * the largest possible single-precision denormal is still smaller
3370      * than the smallest representable half-precision denormal, and so we
3371      * will end up ignoring aSig and returning via the "always return zero"
3372      * codepath.
3373      */
3374     aSig |= 0x00800000;
3375     aExp -= 0x71;
3376 
3377     return roundAndPackFloat16(aSign, aExp, aSig, ieee STATUS_VAR);
3378 }
3379 
3380 float64 float16_to_float64(float16 a, flag ieee STATUS_PARAM)
3381 {
3382     flag aSign;
3383     int_fast16_t aExp;
3384     uint32_t aSig;
3385 
3386     aSign = extractFloat16Sign(a);
3387     aExp = extractFloat16Exp(a);
3388     aSig = extractFloat16Frac(a);
3389 
3390     if (aExp == 0x1f && ieee) {
3391         if (aSig) {
3392             return commonNaNToFloat64(
3393                 float16ToCommonNaN(a STATUS_VAR) STATUS_VAR);
3394         }
3395         return packFloat64(aSign, 0x7ff, 0);
3396     }
3397     if (aExp == 0) {
3398         if (aSig == 0) {
3399             return packFloat64(aSign, 0, 0);
3400         }
3401 
3402         normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3403         aExp--;
3404     }
3405     return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42);
3406 }
3407 
3408 float16 float64_to_float16(float64 a, flag ieee STATUS_PARAM)
3409 {
3410     flag aSign;
3411     int_fast16_t aExp;
3412     uint64_t aSig;
3413     uint32_t zSig;
3414 
3415     a = float64_squash_input_denormal(a STATUS_VAR);
3416 
3417     aSig = extractFloat64Frac(a);
3418     aExp = extractFloat64Exp(a);
3419     aSign = extractFloat64Sign(a);
3420     if (aExp == 0x7FF) {
3421         if (aSig) {
3422             /* Input is a NaN */
3423             if (!ieee) {
3424                 float_raise(float_flag_invalid STATUS_VAR);
3425                 return packFloat16(aSign, 0, 0);
3426             }
3427             return commonNaNToFloat16(
3428                 float64ToCommonNaN(a STATUS_VAR) STATUS_VAR);
3429         }
3430         /* Infinity */
3431         if (!ieee) {
3432             float_raise(float_flag_invalid STATUS_VAR);
3433             return packFloat16(aSign, 0x1f, 0x3ff);
3434         }
3435         return packFloat16(aSign, 0x1f, 0);
3436     }
3437     shift64RightJamming(aSig, 29, &aSig);
3438     zSig = aSig;
3439     if (aExp == 0 && zSig == 0) {
3440         return packFloat16(aSign, 0, 0);
3441     }
3442     /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3443      * even if the input is denormal; however this is harmless because
3444      * the largest possible single-precision denormal is still smaller
3445      * than the smallest representable half-precision denormal, and so we
3446      * will end up ignoring aSig and returning via the "always return zero"
3447      * codepath.
3448      */
3449     zSig |= 0x00800000;
3450     aExp -= 0x3F1;
3451 
3452     return roundAndPackFloat16(aSign, aExp, zSig, ieee STATUS_VAR);
3453 }
3454 
3455 /*----------------------------------------------------------------------------
3456 | Returns the result of converting the double-precision floating-point value
3457 | `a' to the extended double-precision floating-point format.  The conversion
3458 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3459 | Arithmetic.
3460 *----------------------------------------------------------------------------*/
3461 
3462 floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
3463 {
3464     flag aSign;
3465     int_fast16_t aExp;
3466     uint64_t aSig;
3467 
3468     a = float64_squash_input_denormal(a STATUS_VAR);
3469     aSig = extractFloat64Frac( a );
3470     aExp = extractFloat64Exp( a );
3471     aSign = extractFloat64Sign( a );
3472     if ( aExp == 0x7FF ) {
3473         if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3474         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3475     }
3476     if ( aExp == 0 ) {
3477         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3478         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3479     }
3480     return
3481         packFloatx80(
3482             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
3483 
3484 }
3485 
3486 /*----------------------------------------------------------------------------
3487 | Returns the result of converting the double-precision floating-point value
3488 | `a' to the quadruple-precision floating-point format.  The conversion is
3489 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3490 | Arithmetic.
3491 *----------------------------------------------------------------------------*/
3492 
3493 float128 float64_to_float128( float64 a STATUS_PARAM )
3494 {
3495     flag aSign;
3496     int_fast16_t aExp;
3497     uint64_t aSig, zSig0, zSig1;
3498 
3499     a = float64_squash_input_denormal(a STATUS_VAR);
3500     aSig = extractFloat64Frac( a );
3501     aExp = extractFloat64Exp( a );
3502     aSign = extractFloat64Sign( a );
3503     if ( aExp == 0x7FF ) {
3504         if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3505         return packFloat128( aSign, 0x7FFF, 0, 0 );
3506     }
3507     if ( aExp == 0 ) {
3508         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3509         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3510         --aExp;
3511     }
3512     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
3513     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
3514 
3515 }
3516 
3517 /*----------------------------------------------------------------------------
3518 | Rounds the double-precision floating-point value `a' to an integer, and
3519 | returns the result as a double-precision floating-point value.  The
3520 | operation is performed according to the IEC/IEEE Standard for Binary
3521 | Floating-Point Arithmetic.
3522 *----------------------------------------------------------------------------*/
3523 
3524 float64 float64_round_to_int( float64 a STATUS_PARAM )
3525 {
3526     flag aSign;
3527     int_fast16_t aExp;
3528     uint64_t lastBitMask, roundBitsMask;
3529     uint64_t z;
3530     a = float64_squash_input_denormal(a STATUS_VAR);
3531 
3532     aExp = extractFloat64Exp( a );
3533     if ( 0x433 <= aExp ) {
3534         if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
3535             return propagateFloat64NaN( a, a STATUS_VAR );
3536         }
3537         return a;
3538     }
3539     if ( aExp < 0x3FF ) {
3540         if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a;
3541         STATUS(float_exception_flags) |= float_flag_inexact;
3542         aSign = extractFloat64Sign( a );
3543         switch ( STATUS(float_rounding_mode) ) {
3544          case float_round_nearest_even:
3545             if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
3546                 return packFloat64( aSign, 0x3FF, 0 );
3547             }
3548             break;
3549         case float_round_ties_away:
3550             if (aExp == 0x3FE) {
3551                 return packFloat64(aSign, 0x3ff, 0);
3552             }
3553             break;
3554          case float_round_down:
3555             return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
3556          case float_round_up:
3557             return make_float64(
3558             aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
3559         }
3560         return packFloat64( aSign, 0, 0 );
3561     }
3562     lastBitMask = 1;
3563     lastBitMask <<= 0x433 - aExp;
3564     roundBitsMask = lastBitMask - 1;
3565     z = float64_val(a);
3566     switch (STATUS(float_rounding_mode)) {
3567     case float_round_nearest_even:
3568         z += lastBitMask >> 1;
3569         if ((z & roundBitsMask) == 0) {
3570             z &= ~lastBitMask;
3571         }
3572         break;
3573     case float_round_ties_away:
3574         z += lastBitMask >> 1;
3575         break;
3576     case float_round_to_zero:
3577         break;
3578     case float_round_up:
3579         if (!extractFloat64Sign(make_float64(z))) {
3580             z += roundBitsMask;
3581         }
3582         break;
3583     case float_round_down:
3584         if (extractFloat64Sign(make_float64(z))) {
3585             z += roundBitsMask;
3586         }
3587         break;
3588     default:
3589         abort();
3590     }
3591     z &= ~ roundBitsMask;
3592     if ( z != float64_val(a) )
3593         STATUS(float_exception_flags) |= float_flag_inexact;
3594     return make_float64(z);
3595 
3596 }
3597 
3598 float64 float64_trunc_to_int( float64 a STATUS_PARAM)
3599 {
3600     int oldmode;
3601     float64 res;
3602     oldmode = STATUS(float_rounding_mode);
3603     STATUS(float_rounding_mode) = float_round_to_zero;
3604     res = float64_round_to_int(a STATUS_VAR);
3605     STATUS(float_rounding_mode) = oldmode;
3606     return res;
3607 }
3608 
3609 /*----------------------------------------------------------------------------
3610 | Returns the result of adding the absolute values of the double-precision
3611 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
3612 | before being returned.  `zSign' is ignored if the result is a NaN.
3613 | The addition is performed according to the IEC/IEEE Standard for Binary
3614 | Floating-Point Arithmetic.
3615 *----------------------------------------------------------------------------*/
3616 
3617 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3618 {
3619     int_fast16_t aExp, bExp, zExp;
3620     uint64_t aSig, bSig, zSig;
3621     int_fast16_t expDiff;
3622 
3623     aSig = extractFloat64Frac( a );
3624     aExp = extractFloat64Exp( a );
3625     bSig = extractFloat64Frac( b );
3626     bExp = extractFloat64Exp( b );
3627     expDiff = aExp - bExp;
3628     aSig <<= 9;
3629     bSig <<= 9;
3630     if ( 0 < expDiff ) {
3631         if ( aExp == 0x7FF ) {
3632             if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3633             return a;
3634         }
3635         if ( bExp == 0 ) {
3636             --expDiff;
3637         }
3638         else {
3639             bSig |= LIT64( 0x2000000000000000 );
3640         }
3641         shift64RightJamming( bSig, expDiff, &bSig );
3642         zExp = aExp;
3643     }
3644     else if ( expDiff < 0 ) {
3645         if ( bExp == 0x7FF ) {
3646             if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3647             return packFloat64( zSign, 0x7FF, 0 );
3648         }
3649         if ( aExp == 0 ) {
3650             ++expDiff;
3651         }
3652         else {
3653             aSig |= LIT64( 0x2000000000000000 );
3654         }
3655         shift64RightJamming( aSig, - expDiff, &aSig );
3656         zExp = bExp;
3657     }
3658     else {
3659         if ( aExp == 0x7FF ) {
3660             if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3661             return a;
3662         }
3663         if ( aExp == 0 ) {
3664             if (STATUS(flush_to_zero)) {
3665                 if (aSig | bSig) {
3666                     float_raise(float_flag_output_denormal STATUS_VAR);
3667                 }
3668                 return packFloat64(zSign, 0, 0);
3669             }
3670             return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
3671         }
3672         zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
3673         zExp = aExp;
3674         goto roundAndPack;
3675     }
3676     aSig |= LIT64( 0x2000000000000000 );
3677     zSig = ( aSig + bSig )<<1;
3678     --zExp;
3679     if ( (int64_t) zSig < 0 ) {
3680         zSig = aSig + bSig;
3681         ++zExp;
3682     }
3683  roundAndPack:
3684     return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3685 
3686 }
3687 
3688 /*----------------------------------------------------------------------------
3689 | Returns the result of subtracting the absolute values of the double-
3690 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
3691 | difference is negated before being returned.  `zSign' is ignored if the
3692 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
3693 | Standard for Binary Floating-Point Arithmetic.
3694 *----------------------------------------------------------------------------*/
3695 
3696 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3697 {
3698     int_fast16_t aExp, bExp, zExp;
3699     uint64_t aSig, bSig, zSig;
3700     int_fast16_t expDiff;
3701 
3702     aSig = extractFloat64Frac( a );
3703     aExp = extractFloat64Exp( a );
3704     bSig = extractFloat64Frac( b );
3705     bExp = extractFloat64Exp( b );
3706     expDiff = aExp - bExp;
3707     aSig <<= 10;
3708     bSig <<= 10;
3709     if ( 0 < expDiff ) goto aExpBigger;
3710     if ( expDiff < 0 ) goto bExpBigger;
3711     if ( aExp == 0x7FF ) {
3712         if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3713         float_raise( float_flag_invalid STATUS_VAR);
3714         return float64_default_nan;
3715     }
3716     if ( aExp == 0 ) {
3717         aExp = 1;
3718         bExp = 1;
3719     }
3720     if ( bSig < aSig ) goto aBigger;
3721     if ( aSig < bSig ) goto bBigger;
3722     return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3723  bExpBigger:
3724     if ( bExp == 0x7FF ) {
3725         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3726         return packFloat64( zSign ^ 1, 0x7FF, 0 );
3727     }
3728     if ( aExp == 0 ) {
3729         ++expDiff;
3730     }
3731     else {
3732         aSig |= LIT64( 0x4000000000000000 );
3733     }
3734     shift64RightJamming( aSig, - expDiff, &aSig );
3735     bSig |= LIT64( 0x4000000000000000 );
3736  bBigger:
3737     zSig = bSig - aSig;
3738     zExp = bExp;
3739     zSign ^= 1;
3740     goto normalizeRoundAndPack;
3741  aExpBigger:
3742     if ( aExp == 0x7FF ) {
3743         if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3744         return a;
3745     }
3746     if ( bExp == 0 ) {
3747         --expDiff;
3748     }
3749     else {
3750         bSig |= LIT64( 0x4000000000000000 );
3751     }
3752     shift64RightJamming( bSig, expDiff, &bSig );
3753     aSig |= LIT64( 0x4000000000000000 );
3754  aBigger:
3755     zSig = aSig - bSig;
3756     zExp = aExp;
3757  normalizeRoundAndPack:
3758     --zExp;
3759     return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3760 
3761 }
3762 
3763 /*----------------------------------------------------------------------------
3764 | Returns the result of adding the double-precision floating-point values `a'
3765 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
3766 | Binary Floating-Point Arithmetic.
3767 *----------------------------------------------------------------------------*/
3768 
3769 float64 float64_add( float64 a, float64 b STATUS_PARAM )
3770 {
3771     flag aSign, bSign;
3772     a = float64_squash_input_denormal(a STATUS_VAR);
3773     b = float64_squash_input_denormal(b STATUS_VAR);
3774 
3775     aSign = extractFloat64Sign( a );
3776     bSign = extractFloat64Sign( b );
3777     if ( aSign == bSign ) {
3778         return addFloat64Sigs( a, b, aSign STATUS_VAR );
3779     }
3780     else {
3781         return subFloat64Sigs( a, b, aSign STATUS_VAR );
3782     }
3783 
3784 }
3785 
3786 /*----------------------------------------------------------------------------
3787 | Returns the result of subtracting the double-precision floating-point values
3788 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3789 | for Binary Floating-Point Arithmetic.
3790 *----------------------------------------------------------------------------*/
3791 
3792 float64 float64_sub( float64 a, float64 b STATUS_PARAM )
3793 {
3794     flag aSign, bSign;
3795     a = float64_squash_input_denormal(a STATUS_VAR);
3796     b = float64_squash_input_denormal(b STATUS_VAR);
3797 
3798     aSign = extractFloat64Sign( a );
3799     bSign = extractFloat64Sign( b );
3800     if ( aSign == bSign ) {
3801         return subFloat64Sigs( a, b, aSign STATUS_VAR );
3802     }
3803     else {
3804         return addFloat64Sigs( a, b, aSign STATUS_VAR );
3805     }
3806 
3807 }
3808 
3809 /*----------------------------------------------------------------------------
3810 | Returns the result of multiplying the double-precision floating-point values
3811 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3812 | for Binary Floating-Point Arithmetic.
3813 *----------------------------------------------------------------------------*/
3814 
3815 float64 float64_mul( float64 a, float64 b STATUS_PARAM )
3816 {
3817     flag aSign, bSign, zSign;
3818     int_fast16_t aExp, bExp, zExp;
3819     uint64_t aSig, bSig, zSig0, zSig1;
3820 
3821     a = float64_squash_input_denormal(a STATUS_VAR);
3822     b = float64_squash_input_denormal(b STATUS_VAR);
3823 
3824     aSig = extractFloat64Frac( a );
3825     aExp = extractFloat64Exp( a );
3826     aSign = extractFloat64Sign( a );
3827     bSig = extractFloat64Frac( b );
3828     bExp = extractFloat64Exp( b );
3829     bSign = extractFloat64Sign( b );
3830     zSign = aSign ^ bSign;
3831     if ( aExp == 0x7FF ) {
3832         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3833             return propagateFloat64NaN( a, b STATUS_VAR );
3834         }
3835         if ( ( bExp | bSig ) == 0 ) {
3836             float_raise( float_flag_invalid STATUS_VAR);
3837             return float64_default_nan;
3838         }
3839         return packFloat64( zSign, 0x7FF, 0 );
3840     }
3841     if ( bExp == 0x7FF ) {
3842         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3843         if ( ( aExp | aSig ) == 0 ) {
3844             float_raise( float_flag_invalid STATUS_VAR);
3845             return float64_default_nan;
3846         }
3847         return packFloat64( zSign, 0x7FF, 0 );
3848     }
3849     if ( aExp == 0 ) {
3850         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3851         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3852     }
3853     if ( bExp == 0 ) {
3854         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
3855         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3856     }
3857     zExp = aExp + bExp - 0x3FF;
3858     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3859     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3860     mul64To128( aSig, bSig, &zSig0, &zSig1 );
3861     zSig0 |= ( zSig1 != 0 );
3862     if ( 0 <= (int64_t) ( zSig0<<1 ) ) {
3863         zSig0 <<= 1;
3864         --zExp;
3865     }
3866     return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
3867 
3868 }
3869 
3870 /*----------------------------------------------------------------------------
3871 | Returns the result of dividing the double-precision floating-point value `a'
3872 | by the corresponding value `b'.  The operation is performed according to
3873 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3874 *----------------------------------------------------------------------------*/
3875 
3876 float64 float64_div( float64 a, float64 b STATUS_PARAM )
3877 {
3878     flag aSign, bSign, zSign;
3879     int_fast16_t aExp, bExp, zExp;
3880     uint64_t aSig, bSig, zSig;
3881     uint64_t rem0, rem1;
3882     uint64_t term0, term1;
3883     a = float64_squash_input_denormal(a STATUS_VAR);
3884     b = float64_squash_input_denormal(b STATUS_VAR);
3885 
3886     aSig = extractFloat64Frac( a );
3887     aExp = extractFloat64Exp( a );
3888     aSign = extractFloat64Sign( a );
3889     bSig = extractFloat64Frac( b );
3890     bExp = extractFloat64Exp( b );
3891     bSign = extractFloat64Sign( b );
3892     zSign = aSign ^ bSign;
3893     if ( aExp == 0x7FF ) {
3894         if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3895         if ( bExp == 0x7FF ) {
3896             if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3897             float_raise( float_flag_invalid STATUS_VAR);
3898             return float64_default_nan;
3899         }
3900         return packFloat64( zSign, 0x7FF, 0 );
3901     }
3902     if ( bExp == 0x7FF ) {
3903         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3904         return packFloat64( zSign, 0, 0 );
3905     }
3906     if ( bExp == 0 ) {
3907         if ( bSig == 0 ) {
3908             if ( ( aExp | aSig ) == 0 ) {
3909                 float_raise( float_flag_invalid STATUS_VAR);
3910                 return float64_default_nan;
3911             }
3912             float_raise( float_flag_divbyzero STATUS_VAR);
3913             return packFloat64( zSign, 0x7FF, 0 );
3914         }
3915         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3916     }
3917     if ( aExp == 0 ) {
3918         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3919         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3920     }
3921     zExp = aExp - bExp + 0x3FD;
3922     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3923     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3924     if ( bSig <= ( aSig + aSig ) ) {
3925         aSig >>= 1;
3926         ++zExp;
3927     }
3928     zSig = estimateDiv128To64( aSig, 0, bSig );
3929     if ( ( zSig & 0x1FF ) <= 2 ) {
3930         mul64To128( bSig, zSig, &term0, &term1 );
3931         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3932         while ( (int64_t) rem0 < 0 ) {
3933             --zSig;
3934             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3935         }
3936         zSig |= ( rem1 != 0 );
3937     }
3938     return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3939 
3940 }
3941 
3942 /*----------------------------------------------------------------------------
3943 | Returns the remainder of the double-precision floating-point value `a'
3944 | with respect to the corresponding value `b'.  The operation is performed
3945 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3946 *----------------------------------------------------------------------------*/
3947 
3948 float64 float64_rem( float64 a, float64 b STATUS_PARAM )
3949 {
3950     flag aSign, zSign;
3951     int_fast16_t aExp, bExp, expDiff;
3952     uint64_t aSig, bSig;
3953     uint64_t q, alternateASig;
3954     int64_t sigMean;
3955 
3956     a = float64_squash_input_denormal(a STATUS_VAR);
3957     b = float64_squash_input_denormal(b STATUS_VAR);
3958     aSig = extractFloat64Frac( a );
3959     aExp = extractFloat64Exp( a );
3960     aSign = extractFloat64Sign( a );
3961     bSig = extractFloat64Frac( b );
3962     bExp = extractFloat64Exp( b );
3963     if ( aExp == 0x7FF ) {
3964         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3965             return propagateFloat64NaN( a, b STATUS_VAR );
3966         }
3967         float_raise( float_flag_invalid STATUS_VAR);
3968         return float64_default_nan;
3969     }
3970     if ( bExp == 0x7FF ) {
3971         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3972         return a;
3973     }
3974     if ( bExp == 0 ) {
3975         if ( bSig == 0 ) {
3976             float_raise( float_flag_invalid STATUS_VAR);
3977             return float64_default_nan;
3978         }
3979         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3980     }
3981     if ( aExp == 0 ) {
3982         if ( aSig == 0 ) return a;
3983         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3984     }
3985     expDiff = aExp - bExp;
3986     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3987     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3988     if ( expDiff < 0 ) {
3989         if ( expDiff < -1 ) return a;
3990         aSig >>= 1;
3991     }
3992     q = ( bSig <= aSig );
3993     if ( q ) aSig -= bSig;
3994     expDiff -= 64;
3995     while ( 0 < expDiff ) {
3996         q = estimateDiv128To64( aSig, 0, bSig );
3997         q = ( 2 < q ) ? q - 2 : 0;
3998         aSig = - ( ( bSig>>2 ) * q );
3999         expDiff -= 62;
4000     }
4001     expDiff += 64;
4002     if ( 0 < expDiff ) {
4003         q = estimateDiv128To64( aSig, 0, bSig );
4004         q = ( 2 < q ) ? q - 2 : 0;
4005         q >>= 64 - expDiff;
4006         bSig >>= 2;
4007         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4008     }
4009     else {
4010         aSig >>= 2;
4011         bSig >>= 2;
4012     }
4013     do {
4014         alternateASig = aSig;
4015         ++q;
4016         aSig -= bSig;
4017     } while ( 0 <= (int64_t) aSig );
4018     sigMean = aSig + alternateASig;
4019     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4020         aSig = alternateASig;
4021     }
4022     zSign = ( (int64_t) aSig < 0 );
4023     if ( zSign ) aSig = - aSig;
4024     return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
4025 
4026 }
4027 
4028 /*----------------------------------------------------------------------------
4029 | Returns the result of multiplying the double-precision floating-point values
4030 | `a' and `b' then adding 'c', with no intermediate rounding step after the
4031 | multiplication.  The operation is performed according to the IEC/IEEE
4032 | Standard for Binary Floating-Point Arithmetic 754-2008.
4033 | The flags argument allows the caller to select negation of the
4034 | addend, the intermediate product, or the final result. (The difference
4035 | between this and having the caller do a separate negation is that negating
4036 | externally will flip the sign bit on NaNs.)
4037 *----------------------------------------------------------------------------*/
4038 
4039 float64 float64_muladd(float64 a, float64 b, float64 c, int flags STATUS_PARAM)
4040 {
4041     flag aSign, bSign, cSign, zSign;
4042     int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
4043     uint64_t aSig, bSig, cSig;
4044     flag pInf, pZero, pSign;
4045     uint64_t pSig0, pSig1, cSig0, cSig1, zSig0, zSig1;
4046     int shiftcount;
4047     flag signflip, infzero;
4048 
4049     a = float64_squash_input_denormal(a STATUS_VAR);
4050     b = float64_squash_input_denormal(b STATUS_VAR);
4051     c = float64_squash_input_denormal(c STATUS_VAR);
4052     aSig = extractFloat64Frac(a);
4053     aExp = extractFloat64Exp(a);
4054     aSign = extractFloat64Sign(a);
4055     bSig = extractFloat64Frac(b);
4056     bExp = extractFloat64Exp(b);
4057     bSign = extractFloat64Sign(b);
4058     cSig = extractFloat64Frac(c);
4059     cExp = extractFloat64Exp(c);
4060     cSign = extractFloat64Sign(c);
4061 
4062     infzero = ((aExp == 0 && aSig == 0 && bExp == 0x7ff && bSig == 0) ||
4063                (aExp == 0x7ff && aSig == 0 && bExp == 0 && bSig == 0));
4064 
4065     /* It is implementation-defined whether the cases of (0,inf,qnan)
4066      * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
4067      * they return if they do), so we have to hand this information
4068      * off to the target-specific pick-a-NaN routine.
4069      */
4070     if (((aExp == 0x7ff) && aSig) ||
4071         ((bExp == 0x7ff) && bSig) ||
4072         ((cExp == 0x7ff) && cSig)) {
4073         return propagateFloat64MulAddNaN(a, b, c, infzero STATUS_VAR);
4074     }
4075 
4076     if (infzero) {
4077         float_raise(float_flag_invalid STATUS_VAR);
4078         return float64_default_nan;
4079     }
4080 
4081     if (flags & float_muladd_negate_c) {
4082         cSign ^= 1;
4083     }
4084 
4085     signflip = (flags & float_muladd_negate_result) ? 1 : 0;
4086 
4087     /* Work out the sign and type of the product */
4088     pSign = aSign ^ bSign;
4089     if (flags & float_muladd_negate_product) {
4090         pSign ^= 1;
4091     }
4092     pInf = (aExp == 0x7ff) || (bExp == 0x7ff);
4093     pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
4094 
4095     if (cExp == 0x7ff) {
4096         if (pInf && (pSign ^ cSign)) {
4097             /* addition of opposite-signed infinities => InvalidOperation */
4098             float_raise(float_flag_invalid STATUS_VAR);
4099             return float64_default_nan;
4100         }
4101         /* Otherwise generate an infinity of the same sign */
4102         return packFloat64(cSign ^ signflip, 0x7ff, 0);
4103     }
4104 
4105     if (pInf) {
4106         return packFloat64(pSign ^ signflip, 0x7ff, 0);
4107     }
4108 
4109     if (pZero) {
4110         if (cExp == 0) {
4111             if (cSig == 0) {
4112                 /* Adding two exact zeroes */
4113                 if (pSign == cSign) {
4114                     zSign = pSign;
4115                 } else if (STATUS(float_rounding_mode) == float_round_down) {
4116                     zSign = 1;
4117                 } else {
4118                     zSign = 0;
4119                 }
4120                 return packFloat64(zSign ^ signflip, 0, 0);
4121             }
4122             /* Exact zero plus a denorm */
4123             if (STATUS(flush_to_zero)) {
4124                 float_raise(float_flag_output_denormal STATUS_VAR);
4125                 return packFloat64(cSign ^ signflip, 0, 0);
4126             }
4127         }
4128         /* Zero plus something non-zero : just return the something */
4129         if (flags & float_muladd_halve_result) {
4130             if (cExp == 0) {
4131                 normalizeFloat64Subnormal(cSig, &cExp, &cSig);
4132             }
4133             /* Subtract one to halve, and one again because roundAndPackFloat64
4134              * wants one less than the true exponent.
4135              */
4136             cExp -= 2;
4137             cSig = (cSig | 0x0010000000000000ULL) << 10;
4138             return roundAndPackFloat64(cSign ^ signflip, cExp, cSig STATUS_VAR);
4139         }
4140         return packFloat64(cSign ^ signflip, cExp, cSig);
4141     }
4142 
4143     if (aExp == 0) {
4144         normalizeFloat64Subnormal(aSig, &aExp, &aSig);
4145     }
4146     if (bExp == 0) {
4147         normalizeFloat64Subnormal(bSig, &bExp, &bSig);
4148     }
4149 
4150     /* Calculate the actual result a * b + c */
4151 
4152     /* Multiply first; this is easy. */
4153     /* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff
4154      * because we want the true exponent, not the "one-less-than"
4155      * flavour that roundAndPackFloat64() takes.
4156      */
4157     pExp = aExp + bExp - 0x3fe;
4158     aSig = (aSig | LIT64(0x0010000000000000))<<10;
4159     bSig = (bSig | LIT64(0x0010000000000000))<<11;
4160     mul64To128(aSig, bSig, &pSig0, &pSig1);
4161     if ((int64_t)(pSig0 << 1) >= 0) {
4162         shortShift128Left(pSig0, pSig1, 1, &pSig0, &pSig1);
4163         pExp--;
4164     }
4165 
4166     zSign = pSign ^ signflip;
4167 
4168     /* Now [pSig0:pSig1] is the significand of the multiply, with the explicit
4169      * bit in position 126.
4170      */
4171     if (cExp == 0) {
4172         if (!cSig) {
4173             /* Throw out the special case of c being an exact zero now */
4174             shift128RightJamming(pSig0, pSig1, 64, &pSig0, &pSig1);
4175             if (flags & float_muladd_halve_result) {
4176                 pExp--;
4177             }
4178             return roundAndPackFloat64(zSign, pExp - 1,
4179                                        pSig1 STATUS_VAR);
4180         }
4181         normalizeFloat64Subnormal(cSig, &cExp, &cSig);
4182     }
4183 
4184     /* Shift cSig and add the explicit bit so [cSig0:cSig1] is the
4185      * significand of the addend, with the explicit bit in position 126.
4186      */
4187     cSig0 = cSig << (126 - 64 - 52);
4188     cSig1 = 0;
4189     cSig0 |= LIT64(0x4000000000000000);
4190     expDiff = pExp - cExp;
4191 
4192     if (pSign == cSign) {
4193         /* Addition */
4194         if (expDiff > 0) {
4195             /* scale c to match p */
4196             shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
4197             zExp = pExp;
4198         } else if (expDiff < 0) {
4199             /* scale p to match c */
4200             shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
4201             zExp = cExp;
4202         } else {
4203             /* no scaling needed */
4204             zExp = cExp;
4205         }
4206         /* Add significands and make sure explicit bit ends up in posn 126 */
4207         add128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4208         if ((int64_t)zSig0 < 0) {
4209             shift128RightJamming(zSig0, zSig1, 1, &zSig0, &zSig1);
4210         } else {
4211             zExp--;
4212         }
4213         shift128RightJamming(zSig0, zSig1, 64, &zSig0, &zSig1);
4214         if (flags & float_muladd_halve_result) {
4215             zExp--;
4216         }
4217         return roundAndPackFloat64(zSign, zExp, zSig1 STATUS_VAR);
4218     } else {
4219         /* Subtraction */
4220         if (expDiff > 0) {
4221             shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
4222             sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4223             zExp = pExp;
4224         } else if (expDiff < 0) {
4225             shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
4226             sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
4227             zExp = cExp;
4228             zSign ^= 1;
4229         } else {
4230             zExp = pExp;
4231             if (lt128(cSig0, cSig1, pSig0, pSig1)) {
4232                 sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4233             } else if (lt128(pSig0, pSig1, cSig0, cSig1)) {
4234                 sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
4235                 zSign ^= 1;
4236             } else {
4237                 /* Exact zero */
4238                 zSign = signflip;
4239                 if (STATUS(float_rounding_mode) == float_round_down) {
4240                     zSign ^= 1;
4241                 }
4242                 return packFloat64(zSign, 0, 0);
4243             }
4244         }
4245         --zExp;
4246         /* Do the equivalent of normalizeRoundAndPackFloat64() but
4247          * starting with the significand in a pair of uint64_t.
4248          */
4249         if (zSig0) {
4250             shiftcount = countLeadingZeros64(zSig0) - 1;
4251             shortShift128Left(zSig0, zSig1, shiftcount, &zSig0, &zSig1);
4252             if (zSig1) {
4253                 zSig0 |= 1;
4254             }
4255             zExp -= shiftcount;
4256         } else {
4257             shiftcount = countLeadingZeros64(zSig1);
4258             if (shiftcount == 0) {
4259                 zSig0 = (zSig1 >> 1) | (zSig1 & 1);
4260                 zExp -= 63;
4261             } else {
4262                 shiftcount--;
4263                 zSig0 = zSig1 << shiftcount;
4264                 zExp -= (shiftcount + 64);
4265             }
4266         }
4267         if (flags & float_muladd_halve_result) {
4268             zExp--;
4269         }
4270         return roundAndPackFloat64(zSign, zExp, zSig0 STATUS_VAR);
4271     }
4272 }
4273 
4274 /*----------------------------------------------------------------------------
4275 | Returns the square root of the double-precision floating-point value `a'.
4276 | The operation is performed according to the IEC/IEEE Standard for Binary
4277 | Floating-Point Arithmetic.
4278 *----------------------------------------------------------------------------*/
4279 
4280 float64 float64_sqrt( float64 a STATUS_PARAM )
4281 {
4282     flag aSign;
4283     int_fast16_t aExp, zExp;
4284     uint64_t aSig, zSig, doubleZSig;
4285     uint64_t rem0, rem1, term0, term1;
4286     a = float64_squash_input_denormal(a STATUS_VAR);
4287 
4288     aSig = extractFloat64Frac( a );
4289     aExp = extractFloat64Exp( a );
4290     aSign = extractFloat64Sign( a );
4291     if ( aExp == 0x7FF ) {
4292         if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
4293         if ( ! aSign ) return a;
4294         float_raise( float_flag_invalid STATUS_VAR);
4295         return float64_default_nan;
4296     }
4297     if ( aSign ) {
4298         if ( ( aExp | aSig ) == 0 ) return a;
4299         float_raise( float_flag_invalid STATUS_VAR);
4300         return float64_default_nan;
4301     }
4302     if ( aExp == 0 ) {
4303         if ( aSig == 0 ) return float64_zero;
4304         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4305     }
4306     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
4307     aSig |= LIT64( 0x0010000000000000 );
4308     zSig = estimateSqrt32( aExp, aSig>>21 );
4309     aSig <<= 9 - ( aExp & 1 );
4310     zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
4311     if ( ( zSig & 0x1FF ) <= 5 ) {
4312         doubleZSig = zSig<<1;
4313         mul64To128( zSig, zSig, &term0, &term1 );
4314         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
4315         while ( (int64_t) rem0 < 0 ) {
4316             --zSig;
4317             doubleZSig -= 2;
4318             add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
4319         }
4320         zSig |= ( ( rem0 | rem1 ) != 0 );
4321     }
4322     return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
4323 
4324 }
4325 
4326 /*----------------------------------------------------------------------------
4327 | Returns the binary log of the double-precision floating-point value `a'.
4328 | The operation is performed according to the IEC/IEEE Standard for Binary
4329 | Floating-Point Arithmetic.
4330 *----------------------------------------------------------------------------*/
4331 float64 float64_log2( float64 a STATUS_PARAM )
4332 {
4333     flag aSign, zSign;
4334     int_fast16_t aExp;
4335     uint64_t aSig, aSig0, aSig1, zSig, i;
4336     a = float64_squash_input_denormal(a STATUS_VAR);
4337 
4338     aSig = extractFloat64Frac( a );
4339     aExp = extractFloat64Exp( a );
4340     aSign = extractFloat64Sign( a );
4341 
4342     if ( aExp == 0 ) {
4343         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4344         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4345     }
4346     if ( aSign ) {
4347         float_raise( float_flag_invalid STATUS_VAR);
4348         return float64_default_nan;
4349     }
4350     if ( aExp == 0x7FF ) {
4351         if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR );
4352         return a;
4353     }
4354 
4355     aExp -= 0x3FF;
4356     aSig |= LIT64( 0x0010000000000000 );
4357     zSign = aExp < 0;
4358     zSig = (uint64_t)aExp << 52;
4359     for (i = 1LL << 51; i > 0; i >>= 1) {
4360         mul64To128( aSig, aSig, &aSig0, &aSig1 );
4361         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4362         if ( aSig & LIT64( 0x0020000000000000 ) ) {
4363             aSig >>= 1;
4364             zSig |= i;
4365         }
4366     }
4367 
4368     if ( zSign )
4369         zSig = -zSig;
4370     return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR );
4371 }
4372 
4373 /*----------------------------------------------------------------------------
4374 | Returns 1 if the double-precision floating-point value `a' is equal to the
4375 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
4376 | if either operand is a NaN.  Otherwise, the comparison is performed
4377 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4378 *----------------------------------------------------------------------------*/
4379 
4380 int float64_eq( float64 a, float64 b STATUS_PARAM )
4381 {
4382     uint64_t av, bv;
4383     a = float64_squash_input_denormal(a STATUS_VAR);
4384     b = float64_squash_input_denormal(b STATUS_VAR);
4385 
4386     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4387          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4388        ) {
4389         float_raise( float_flag_invalid STATUS_VAR);
4390         return 0;
4391     }
4392     av = float64_val(a);
4393     bv = float64_val(b);
4394     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4395 
4396 }
4397 
4398 /*----------------------------------------------------------------------------
4399 | Returns 1 if the double-precision floating-point value `a' is less than or
4400 | equal to the corresponding value `b', and 0 otherwise.  The invalid
4401 | exception is raised if either operand is a NaN.  The comparison is performed
4402 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4403 *----------------------------------------------------------------------------*/
4404 
4405 int float64_le( float64 a, float64 b STATUS_PARAM )
4406 {
4407     flag aSign, bSign;
4408     uint64_t av, bv;
4409     a = float64_squash_input_denormal(a STATUS_VAR);
4410     b = float64_squash_input_denormal(b STATUS_VAR);
4411 
4412     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4413          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4414        ) {
4415         float_raise( float_flag_invalid STATUS_VAR);
4416         return 0;
4417     }
4418     aSign = extractFloat64Sign( a );
4419     bSign = extractFloat64Sign( b );
4420     av = float64_val(a);
4421     bv = float64_val(b);
4422     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4423     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4424 
4425 }
4426 
4427 /*----------------------------------------------------------------------------
4428 | Returns 1 if the double-precision floating-point value `a' is less than
4429 | the corresponding value `b', and 0 otherwise.  The invalid exception is
4430 | raised if either operand is a NaN.  The comparison is performed according
4431 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4432 *----------------------------------------------------------------------------*/
4433 
4434 int float64_lt( float64 a, float64 b STATUS_PARAM )
4435 {
4436     flag aSign, bSign;
4437     uint64_t av, bv;
4438 
4439     a = float64_squash_input_denormal(a STATUS_VAR);
4440     b = float64_squash_input_denormal(b STATUS_VAR);
4441     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4442          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4443        ) {
4444         float_raise( float_flag_invalid STATUS_VAR);
4445         return 0;
4446     }
4447     aSign = extractFloat64Sign( a );
4448     bSign = extractFloat64Sign( b );
4449     av = float64_val(a);
4450     bv = float64_val(b);
4451     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4452     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4453 
4454 }
4455 
4456 /*----------------------------------------------------------------------------
4457 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4458 | be compared, and 0 otherwise.  The invalid exception is raised if either
4459 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
4460 | Standard for Binary Floating-Point Arithmetic.
4461 *----------------------------------------------------------------------------*/
4462 
4463 int float64_unordered( float64 a, float64 b STATUS_PARAM )
4464 {
4465     a = float64_squash_input_denormal(a STATUS_VAR);
4466     b = float64_squash_input_denormal(b STATUS_VAR);
4467 
4468     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4469          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4470        ) {
4471         float_raise( float_flag_invalid STATUS_VAR);
4472         return 1;
4473     }
4474     return 0;
4475 }
4476 
4477 /*----------------------------------------------------------------------------
4478 | Returns 1 if the double-precision floating-point value `a' is equal to the
4479 | corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4480 | exception.The comparison is performed according to the IEC/IEEE Standard
4481 | for Binary Floating-Point Arithmetic.
4482 *----------------------------------------------------------------------------*/
4483 
4484 int float64_eq_quiet( float64 a, float64 b STATUS_PARAM )
4485 {
4486     uint64_t av, bv;
4487     a = float64_squash_input_denormal(a STATUS_VAR);
4488     b = float64_squash_input_denormal(b STATUS_VAR);
4489 
4490     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4491          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4492        ) {
4493         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4494             float_raise( float_flag_invalid STATUS_VAR);
4495         }
4496         return 0;
4497     }
4498     av = float64_val(a);
4499     bv = float64_val(b);
4500     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4501 
4502 }
4503 
4504 /*----------------------------------------------------------------------------
4505 | Returns 1 if the double-precision floating-point value `a' is less than or
4506 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4507 | cause an exception.  Otherwise, the comparison is performed according to the
4508 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4509 *----------------------------------------------------------------------------*/
4510 
4511 int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
4512 {
4513     flag aSign, bSign;
4514     uint64_t av, bv;
4515     a = float64_squash_input_denormal(a STATUS_VAR);
4516     b = float64_squash_input_denormal(b STATUS_VAR);
4517 
4518     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4519          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4520        ) {
4521         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4522             float_raise( float_flag_invalid STATUS_VAR);
4523         }
4524         return 0;
4525     }
4526     aSign = extractFloat64Sign( a );
4527     bSign = extractFloat64Sign( b );
4528     av = float64_val(a);
4529     bv = float64_val(b);
4530     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4531     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4532 
4533 }
4534 
4535 /*----------------------------------------------------------------------------
4536 | Returns 1 if the double-precision floating-point value `a' is less than
4537 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4538 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4539 | Standard for Binary Floating-Point Arithmetic.
4540 *----------------------------------------------------------------------------*/
4541 
4542 int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
4543 {
4544     flag aSign, bSign;
4545     uint64_t av, bv;
4546     a = float64_squash_input_denormal(a STATUS_VAR);
4547     b = float64_squash_input_denormal(b STATUS_VAR);
4548 
4549     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4550          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4551        ) {
4552         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4553             float_raise( float_flag_invalid STATUS_VAR);
4554         }
4555         return 0;
4556     }
4557     aSign = extractFloat64Sign( a );
4558     bSign = extractFloat64Sign( b );
4559     av = float64_val(a);
4560     bv = float64_val(b);
4561     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4562     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4563 
4564 }
4565 
4566 /*----------------------------------------------------------------------------
4567 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4568 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
4569 | comparison is performed according to the IEC/IEEE Standard for Binary
4570 | Floating-Point Arithmetic.
4571 *----------------------------------------------------------------------------*/
4572 
4573 int float64_unordered_quiet( float64 a, float64 b STATUS_PARAM )
4574 {
4575     a = float64_squash_input_denormal(a STATUS_VAR);
4576     b = float64_squash_input_denormal(b STATUS_VAR);
4577 
4578     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4579          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4580        ) {
4581         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4582             float_raise( float_flag_invalid STATUS_VAR);
4583         }
4584         return 1;
4585     }
4586     return 0;
4587 }
4588 
4589 /*----------------------------------------------------------------------------
4590 | Returns the result of converting the extended double-precision floating-
4591 | point value `a' to the 32-bit two's complement integer format.  The
4592 | conversion is performed according to the IEC/IEEE Standard for Binary
4593 | Floating-Point Arithmetic---which means in particular that the conversion
4594 | is rounded according to the current rounding mode.  If `a' is a NaN, the
4595 | largest positive integer is returned.  Otherwise, if the conversion
4596 | overflows, the largest integer with the same sign as `a' is returned.
4597 *----------------------------------------------------------------------------*/
4598 
4599 int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
4600 {
4601     flag aSign;
4602     int32 aExp, shiftCount;
4603     uint64_t aSig;
4604 
4605     aSig = extractFloatx80Frac( a );
4606     aExp = extractFloatx80Exp( a );
4607     aSign = extractFloatx80Sign( a );
4608     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4609     shiftCount = 0x4037 - aExp;
4610     if ( shiftCount <= 0 ) shiftCount = 1;
4611     shift64RightJamming( aSig, shiftCount, &aSig );
4612     return roundAndPackInt32( aSign, aSig STATUS_VAR );
4613 
4614 }
4615 
4616 /*----------------------------------------------------------------------------
4617 | Returns the result of converting the extended double-precision floating-
4618 | point value `a' to the 32-bit two's complement integer format.  The
4619 | conversion is performed according to the IEC/IEEE Standard for Binary
4620 | Floating-Point Arithmetic, except that the conversion is always rounded
4621 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4622 | Otherwise, if the conversion overflows, the largest integer with the same
4623 | sign as `a' is returned.
4624 *----------------------------------------------------------------------------*/
4625 
4626 int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
4627 {
4628     flag aSign;
4629     int32 aExp, shiftCount;
4630     uint64_t aSig, savedASig;
4631     int32_t z;
4632 
4633     aSig = extractFloatx80Frac( a );
4634     aExp = extractFloatx80Exp( a );
4635     aSign = extractFloatx80Sign( a );
4636     if ( 0x401E < aExp ) {
4637         if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4638         goto invalid;
4639     }
4640     else if ( aExp < 0x3FFF ) {
4641         if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
4642         return 0;
4643     }
4644     shiftCount = 0x403E - aExp;
4645     savedASig = aSig;
4646     aSig >>= shiftCount;
4647     z = aSig;
4648     if ( aSign ) z = - z;
4649     if ( ( z < 0 ) ^ aSign ) {
4650  invalid:
4651         float_raise( float_flag_invalid STATUS_VAR);
4652         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4653     }
4654     if ( ( aSig<<shiftCount ) != savedASig ) {
4655         STATUS(float_exception_flags) |= float_flag_inexact;
4656     }
4657     return z;
4658 
4659 }
4660 
4661 /*----------------------------------------------------------------------------
4662 | Returns the result of converting the extended double-precision floating-
4663 | point value `a' to the 64-bit two's complement integer format.  The
4664 | conversion is performed according to the IEC/IEEE Standard for Binary
4665 | Floating-Point Arithmetic---which means in particular that the conversion
4666 | is rounded according to the current rounding mode.  If `a' is a NaN,
4667 | the largest positive integer is returned.  Otherwise, if the conversion
4668 | overflows, the largest integer with the same sign as `a' is returned.
4669 *----------------------------------------------------------------------------*/
4670 
4671 int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
4672 {
4673     flag aSign;
4674     int32 aExp, shiftCount;
4675     uint64_t aSig, aSigExtra;
4676 
4677     aSig = extractFloatx80Frac( a );
4678     aExp = extractFloatx80Exp( a );
4679     aSign = extractFloatx80Sign( a );
4680     shiftCount = 0x403E - aExp;
4681     if ( shiftCount <= 0 ) {
4682         if ( shiftCount ) {
4683             float_raise( float_flag_invalid STATUS_VAR);
4684             if (    ! aSign
4685                  || (    ( aExp == 0x7FFF )
4686                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
4687                ) {
4688                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4689             }
4690             return (int64_t) LIT64( 0x8000000000000000 );
4691         }
4692         aSigExtra = 0;
4693     }
4694     else {
4695         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4696     }
4697     return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
4698 
4699 }
4700 
4701 /*----------------------------------------------------------------------------
4702 | Returns the result of converting the extended double-precision floating-
4703 | point value `a' to the 64-bit two's complement integer format.  The
4704 | conversion is performed according to the IEC/IEEE Standard for Binary
4705 | Floating-Point Arithmetic, except that the conversion is always rounded
4706 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4707 | Otherwise, if the conversion overflows, the largest integer with the same
4708 | sign as `a' is returned.
4709 *----------------------------------------------------------------------------*/
4710 
4711 int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
4712 {
4713     flag aSign;
4714     int32 aExp, shiftCount;
4715     uint64_t aSig;
4716     int64 z;
4717 
4718     aSig = extractFloatx80Frac( a );
4719     aExp = extractFloatx80Exp( a );
4720     aSign = extractFloatx80Sign( a );
4721     shiftCount = aExp - 0x403E;
4722     if ( 0 <= shiftCount ) {
4723         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4724         if ( ( a.high != 0xC03E ) || aSig ) {
4725             float_raise( float_flag_invalid STATUS_VAR);
4726             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4727                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4728             }
4729         }
4730         return (int64_t) LIT64( 0x8000000000000000 );
4731     }
4732     else if ( aExp < 0x3FFF ) {
4733         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
4734         return 0;
4735     }
4736     z = aSig>>( - shiftCount );
4737     if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
4738         STATUS(float_exception_flags) |= float_flag_inexact;
4739     }
4740     if ( aSign ) z = - z;
4741     return z;
4742 
4743 }
4744 
4745 /*----------------------------------------------------------------------------
4746 | Returns the result of converting the extended double-precision floating-
4747 | point value `a' to the single-precision floating-point format.  The
4748 | conversion is performed according to the IEC/IEEE Standard for Binary
4749 | Floating-Point Arithmetic.
4750 *----------------------------------------------------------------------------*/
4751 
4752 float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
4753 {
4754     flag aSign;
4755     int32 aExp;
4756     uint64_t aSig;
4757 
4758     aSig = extractFloatx80Frac( a );
4759     aExp = extractFloatx80Exp( a );
4760     aSign = extractFloatx80Sign( a );
4761     if ( aExp == 0x7FFF ) {
4762         if ( (uint64_t) ( aSig<<1 ) ) {
4763             return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4764         }
4765         return packFloat32( aSign, 0xFF, 0 );
4766     }
4767     shift64RightJamming( aSig, 33, &aSig );
4768     if ( aExp || aSig ) aExp -= 0x3F81;
4769     return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
4770 
4771 }
4772 
4773 /*----------------------------------------------------------------------------
4774 | Returns the result of converting the extended double-precision floating-
4775 | point value `a' to the double-precision floating-point format.  The
4776 | conversion is performed according to the IEC/IEEE Standard for Binary
4777 | Floating-Point Arithmetic.
4778 *----------------------------------------------------------------------------*/
4779 
4780 float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
4781 {
4782     flag aSign;
4783     int32 aExp;
4784     uint64_t aSig, zSig;
4785 
4786     aSig = extractFloatx80Frac( a );
4787     aExp = extractFloatx80Exp( a );
4788     aSign = extractFloatx80Sign( a );
4789     if ( aExp == 0x7FFF ) {
4790         if ( (uint64_t) ( aSig<<1 ) ) {
4791             return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4792         }
4793         return packFloat64( aSign, 0x7FF, 0 );
4794     }
4795     shift64RightJamming( aSig, 1, &zSig );
4796     if ( aExp || aSig ) aExp -= 0x3C01;
4797     return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
4798 
4799 }
4800 
4801 /*----------------------------------------------------------------------------
4802 | Returns the result of converting the extended double-precision floating-
4803 | point value `a' to the quadruple-precision floating-point format.  The
4804 | conversion is performed according to the IEC/IEEE Standard for Binary
4805 | Floating-Point Arithmetic.
4806 *----------------------------------------------------------------------------*/
4807 
4808 float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
4809 {
4810     flag aSign;
4811     int_fast16_t aExp;
4812     uint64_t aSig, zSig0, zSig1;
4813 
4814     aSig = extractFloatx80Frac( a );
4815     aExp = extractFloatx80Exp( a );
4816     aSign = extractFloatx80Sign( a );
4817     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
4818         return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4819     }
4820     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4821     return packFloat128( aSign, aExp, zSig0, zSig1 );
4822 
4823 }
4824 
4825 /*----------------------------------------------------------------------------
4826 | Rounds the extended double-precision floating-point value `a' to an integer,
4827 | and returns the result as an extended quadruple-precision floating-point
4828 | value.  The operation is performed according to the IEC/IEEE Standard for
4829 | Binary Floating-Point Arithmetic.
4830 *----------------------------------------------------------------------------*/
4831 
4832 floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
4833 {
4834     flag aSign;
4835     int32 aExp;
4836     uint64_t lastBitMask, roundBitsMask;
4837     floatx80 z;
4838 
4839     aExp = extractFloatx80Exp( a );
4840     if ( 0x403E <= aExp ) {
4841         if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
4842             return propagateFloatx80NaN( a, a STATUS_VAR );
4843         }
4844         return a;
4845     }
4846     if ( aExp < 0x3FFF ) {
4847         if (    ( aExp == 0 )
4848              && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
4849             return a;
4850         }
4851         STATUS(float_exception_flags) |= float_flag_inexact;
4852         aSign = extractFloatx80Sign( a );
4853         switch ( STATUS(float_rounding_mode) ) {
4854          case float_round_nearest_even:
4855             if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
4856                ) {
4857                 return
4858                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4859             }
4860             break;
4861         case float_round_ties_away:
4862             if (aExp == 0x3FFE) {
4863                 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
4864             }
4865             break;
4866          case float_round_down:
4867             return
4868                   aSign ?
4869                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4870                 : packFloatx80( 0, 0, 0 );
4871          case float_round_up:
4872             return
4873                   aSign ? packFloatx80( 1, 0, 0 )
4874                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4875         }
4876         return packFloatx80( aSign, 0, 0 );
4877     }
4878     lastBitMask = 1;
4879     lastBitMask <<= 0x403E - aExp;
4880     roundBitsMask = lastBitMask - 1;
4881     z = a;
4882     switch (STATUS(float_rounding_mode)) {
4883     case float_round_nearest_even:
4884         z.low += lastBitMask>>1;
4885         if ((z.low & roundBitsMask) == 0) {
4886             z.low &= ~lastBitMask;
4887         }
4888         break;
4889     case float_round_ties_away:
4890         z.low += lastBitMask >> 1;
4891         break;
4892     case float_round_to_zero:
4893         break;
4894     case float_round_up:
4895         if (!extractFloatx80Sign(z)) {
4896             z.low += roundBitsMask;
4897         }
4898         break;
4899     case float_round_down:
4900         if (extractFloatx80Sign(z)) {
4901             z.low += roundBitsMask;
4902         }
4903         break;
4904     default:
4905         abort();
4906     }
4907     z.low &= ~ roundBitsMask;
4908     if ( z.low == 0 ) {
4909         ++z.high;
4910         z.low = LIT64( 0x8000000000000000 );
4911     }
4912     if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
4913     return z;
4914 
4915 }
4916 
4917 /*----------------------------------------------------------------------------
4918 | Returns the result of adding the absolute values of the extended double-
4919 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
4920 | negated before being returned.  `zSign' is ignored if the result is a NaN.
4921 | The addition is performed according to the IEC/IEEE Standard for Binary
4922 | Floating-Point Arithmetic.
4923 *----------------------------------------------------------------------------*/
4924 
4925 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
4926 {
4927     int32 aExp, bExp, zExp;
4928     uint64_t aSig, bSig, zSig0, zSig1;
4929     int32 expDiff;
4930 
4931     aSig = extractFloatx80Frac( a );
4932     aExp = extractFloatx80Exp( a );
4933     bSig = extractFloatx80Frac( b );
4934     bExp = extractFloatx80Exp( b );
4935     expDiff = aExp - bExp;
4936     if ( 0 < expDiff ) {
4937         if ( aExp == 0x7FFF ) {
4938             if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4939             return a;
4940         }
4941         if ( bExp == 0 ) --expDiff;
4942         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4943         zExp = aExp;
4944     }
4945     else if ( expDiff < 0 ) {
4946         if ( bExp == 0x7FFF ) {
4947             if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4948             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4949         }
4950         if ( aExp == 0 ) ++expDiff;
4951         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4952         zExp = bExp;
4953     }
4954     else {
4955         if ( aExp == 0x7FFF ) {
4956             if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4957                 return propagateFloatx80NaN( a, b STATUS_VAR );
4958             }
4959             return a;
4960         }
4961         zSig1 = 0;
4962         zSig0 = aSig + bSig;
4963         if ( aExp == 0 ) {
4964             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4965             goto roundAndPack;
4966         }
4967         zExp = aExp;
4968         goto shiftRight1;
4969     }
4970     zSig0 = aSig + bSig;
4971     if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
4972  shiftRight1:
4973     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4974     zSig0 |= LIT64( 0x8000000000000000 );
4975     ++zExp;
4976  roundAndPack:
4977     return
4978         roundAndPackFloatx80(
4979             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4980 
4981 }
4982 
4983 /*----------------------------------------------------------------------------
4984 | Returns the result of subtracting the absolute values of the extended
4985 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
4986 | difference is negated before being returned.  `zSign' is ignored if the
4987 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
4988 | Standard for Binary Floating-Point Arithmetic.
4989 *----------------------------------------------------------------------------*/
4990 
4991 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
4992 {
4993     int32 aExp, bExp, zExp;
4994     uint64_t aSig, bSig, zSig0, zSig1;
4995     int32 expDiff;
4996     floatx80 z;
4997 
4998     aSig = extractFloatx80Frac( a );
4999     aExp = extractFloatx80Exp( a );
5000     bSig = extractFloatx80Frac( b );
5001     bExp = extractFloatx80Exp( b );
5002     expDiff = aExp - bExp;
5003     if ( 0 < expDiff ) goto aExpBigger;
5004     if ( expDiff < 0 ) goto bExpBigger;
5005     if ( aExp == 0x7FFF ) {
5006         if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5007             return propagateFloatx80NaN( a, b STATUS_VAR );
5008         }
5009         float_raise( float_flag_invalid STATUS_VAR);
5010         z.low = floatx80_default_nan_low;
5011         z.high = floatx80_default_nan_high;
5012         return z;
5013     }
5014     if ( aExp == 0 ) {
5015         aExp = 1;
5016         bExp = 1;
5017     }
5018     zSig1 = 0;
5019     if ( bSig < aSig ) goto aBigger;
5020     if ( aSig < bSig ) goto bBigger;
5021     return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
5022  bExpBigger:
5023     if ( bExp == 0x7FFF ) {
5024         if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
5025         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
5026     }
5027     if ( aExp == 0 ) ++expDiff;
5028     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5029  bBigger:
5030     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
5031     zExp = bExp;
5032     zSign ^= 1;
5033     goto normalizeRoundAndPack;
5034  aExpBigger:
5035     if ( aExp == 0x7FFF ) {
5036         if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
5037         return a;
5038     }
5039     if ( bExp == 0 ) --expDiff;
5040     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5041  aBigger:
5042     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
5043     zExp = aExp;
5044  normalizeRoundAndPack:
5045     return
5046         normalizeRoundAndPackFloatx80(
5047             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
5048 
5049 }
5050 
5051 /*----------------------------------------------------------------------------
5052 | Returns the result of adding the extended double-precision floating-point
5053 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
5054 | Standard for Binary Floating-Point Arithmetic.
5055 *----------------------------------------------------------------------------*/
5056 
5057 floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
5058 {
5059     flag aSign, bSign;
5060 
5061     aSign = extractFloatx80Sign( a );
5062     bSign = extractFloatx80Sign( b );
5063     if ( aSign == bSign ) {
5064         return addFloatx80Sigs( a, b, aSign STATUS_VAR );
5065     }
5066     else {
5067         return subFloatx80Sigs( a, b, aSign STATUS_VAR );
5068     }
5069 
5070 }
5071 
5072 /*----------------------------------------------------------------------------
5073 | Returns the result of subtracting the extended double-precision floating-
5074 | point values `a' and `b'.  The operation is performed according to the
5075 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5076 *----------------------------------------------------------------------------*/
5077 
5078 floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
5079 {
5080     flag aSign, bSign;
5081 
5082     aSign = extractFloatx80Sign( a );
5083     bSign = extractFloatx80Sign( b );
5084     if ( aSign == bSign ) {
5085         return subFloatx80Sigs( a, b, aSign STATUS_VAR );
5086     }
5087     else {
5088         return addFloatx80Sigs( a, b, aSign STATUS_VAR );
5089     }
5090 
5091 }
5092 
5093 /*----------------------------------------------------------------------------
5094 | Returns the result of multiplying the extended double-precision floating-
5095 | point values `a' and `b'.  The operation is performed according to the
5096 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5097 *----------------------------------------------------------------------------*/
5098 
5099 floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
5100 {
5101     flag aSign, bSign, zSign;
5102     int32 aExp, bExp, zExp;
5103     uint64_t aSig, bSig, zSig0, zSig1;
5104     floatx80 z;
5105 
5106     aSig = extractFloatx80Frac( a );
5107     aExp = extractFloatx80Exp( a );
5108     aSign = extractFloatx80Sign( a );
5109     bSig = extractFloatx80Frac( b );
5110     bExp = extractFloatx80Exp( b );
5111     bSign = extractFloatx80Sign( b );
5112     zSign = aSign ^ bSign;
5113     if ( aExp == 0x7FFF ) {
5114         if (    (uint64_t) ( aSig<<1 )
5115              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5116             return propagateFloatx80NaN( a, b STATUS_VAR );
5117         }
5118         if ( ( bExp | bSig ) == 0 ) goto invalid;
5119         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5120     }
5121     if ( bExp == 0x7FFF ) {
5122         if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
5123         if ( ( aExp | aSig ) == 0 ) {
5124  invalid:
5125             float_raise( float_flag_invalid STATUS_VAR);
5126             z.low = floatx80_default_nan_low;
5127             z.high = floatx80_default_nan_high;
5128             return z;
5129         }
5130         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5131     }
5132     if ( aExp == 0 ) {
5133         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5134         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5135     }
5136     if ( bExp == 0 ) {
5137         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5138         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5139     }
5140     zExp = aExp + bExp - 0x3FFE;
5141     mul64To128( aSig, bSig, &zSig0, &zSig1 );
5142     if ( 0 < (int64_t) zSig0 ) {
5143         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5144         --zExp;
5145     }
5146     return
5147         roundAndPackFloatx80(
5148             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
5149 
5150 }
5151 
5152 /*----------------------------------------------------------------------------
5153 | Returns the result of dividing the extended double-precision floating-point
5154 | value `a' by the corresponding value `b'.  The operation is performed
5155 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5156 *----------------------------------------------------------------------------*/
5157 
5158 floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
5159 {
5160     flag aSign, bSign, zSign;
5161     int32 aExp, bExp, zExp;
5162     uint64_t aSig, bSig, zSig0, zSig1;
5163     uint64_t rem0, rem1, rem2, term0, term1, term2;
5164     floatx80 z;
5165 
5166     aSig = extractFloatx80Frac( a );
5167     aExp = extractFloatx80Exp( a );
5168     aSign = extractFloatx80Sign( a );
5169     bSig = extractFloatx80Frac( b );
5170     bExp = extractFloatx80Exp( b );
5171     bSign = extractFloatx80Sign( b );
5172     zSign = aSign ^ bSign;
5173     if ( aExp == 0x7FFF ) {
5174         if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
5175         if ( bExp == 0x7FFF ) {
5176             if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
5177             goto invalid;
5178         }
5179         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5180     }
5181     if ( bExp == 0x7FFF ) {
5182         if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
5183         return packFloatx80( zSign, 0, 0 );
5184     }
5185     if ( bExp == 0 ) {
5186         if ( bSig == 0 ) {
5187             if ( ( aExp | aSig ) == 0 ) {
5188  invalid:
5189                 float_raise( float_flag_invalid STATUS_VAR);
5190                 z.low = floatx80_default_nan_low;
5191                 z.high = floatx80_default_nan_high;
5192                 return z;
5193             }
5194             float_raise( float_flag_divbyzero STATUS_VAR);
5195             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5196         }
5197         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5198     }
5199     if ( aExp == 0 ) {
5200         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5201         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5202     }
5203     zExp = aExp - bExp + 0x3FFE;
5204     rem1 = 0;
5205     if ( bSig <= aSig ) {
5206         shift128Right( aSig, 0, 1, &aSig, &rem1 );
5207         ++zExp;
5208     }
5209     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5210     mul64To128( bSig, zSig0, &term0, &term1 );
5211     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
5212     while ( (int64_t) rem0 < 0 ) {
5213         --zSig0;
5214         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5215     }
5216     zSig1 = estimateDiv128To64( rem1, 0, bSig );
5217     if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
5218         mul64To128( bSig, zSig1, &term1, &term2 );
5219         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5220         while ( (int64_t) rem1 < 0 ) {
5221             --zSig1;
5222             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5223         }
5224         zSig1 |= ( ( rem1 | rem2 ) != 0 );
5225     }
5226     return
5227         roundAndPackFloatx80(
5228             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
5229 
5230 }
5231 
5232 /*----------------------------------------------------------------------------
5233 | Returns the remainder of the extended double-precision floating-point value
5234 | `a' with respect to the corresponding value `b'.  The operation is performed
5235 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5236 *----------------------------------------------------------------------------*/
5237 
5238 floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
5239 {
5240     flag aSign, zSign;
5241     int32 aExp, bExp, expDiff;
5242     uint64_t aSig0, aSig1, bSig;
5243     uint64_t q, term0, term1, alternateASig0, alternateASig1;
5244     floatx80 z;
5245 
5246     aSig0 = extractFloatx80Frac( a );
5247     aExp = extractFloatx80Exp( a );
5248     aSign = extractFloatx80Sign( a );
5249     bSig = extractFloatx80Frac( b );
5250     bExp = extractFloatx80Exp( b );
5251     if ( aExp == 0x7FFF ) {
5252         if (    (uint64_t) ( aSig0<<1 )
5253              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5254             return propagateFloatx80NaN( a, b STATUS_VAR );
5255         }
5256         goto invalid;
5257     }
5258     if ( bExp == 0x7FFF ) {
5259         if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
5260         return a;
5261     }
5262     if ( bExp == 0 ) {
5263         if ( bSig == 0 ) {
5264  invalid:
5265             float_raise( float_flag_invalid STATUS_VAR);
5266             z.low = floatx80_default_nan_low;
5267             z.high = floatx80_default_nan_high;
5268             return z;
5269         }
5270         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5271     }
5272     if ( aExp == 0 ) {
5273         if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
5274         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5275     }
5276     bSig |= LIT64( 0x8000000000000000 );
5277     zSign = aSign;
5278     expDiff = aExp - bExp;
5279     aSig1 = 0;
5280     if ( expDiff < 0 ) {
5281         if ( expDiff < -1 ) return a;
5282         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5283         expDiff = 0;
5284     }
5285     q = ( bSig <= aSig0 );
5286     if ( q ) aSig0 -= bSig;
5287     expDiff -= 64;
5288     while ( 0 < expDiff ) {
5289         q = estimateDiv128To64( aSig0, aSig1, bSig );
5290         q = ( 2 < q ) ? q - 2 : 0;
5291         mul64To128( bSig, q, &term0, &term1 );
5292         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5293         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5294         expDiff -= 62;
5295     }
5296     expDiff += 64;
5297     if ( 0 < expDiff ) {
5298         q = estimateDiv128To64( aSig0, aSig1, bSig );
5299         q = ( 2 < q ) ? q - 2 : 0;
5300         q >>= 64 - expDiff;
5301         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5302         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5303         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5304         while ( le128( term0, term1, aSig0, aSig1 ) ) {
5305             ++q;
5306             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5307         }
5308     }
5309     else {
5310         term1 = 0;
5311         term0 = bSig;
5312     }
5313     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5314     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5315          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5316               && ( q & 1 ) )
5317        ) {
5318         aSig0 = alternateASig0;
5319         aSig1 = alternateASig1;
5320         zSign = ! zSign;
5321     }
5322     return
5323         normalizeRoundAndPackFloatx80(
5324             80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
5325 
5326 }
5327 
5328 /*----------------------------------------------------------------------------
5329 | Returns the square root of the extended double-precision floating-point
5330 | value `a'.  The operation is performed according to the IEC/IEEE Standard
5331 | for Binary Floating-Point Arithmetic.
5332 *----------------------------------------------------------------------------*/
5333 
5334 floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
5335 {
5336     flag aSign;
5337     int32 aExp, zExp;
5338     uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5339     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5340     floatx80 z;
5341 
5342     aSig0 = extractFloatx80Frac( a );
5343     aExp = extractFloatx80Exp( a );
5344     aSign = extractFloatx80Sign( a );
5345     if ( aExp == 0x7FFF ) {
5346         if ( (uint64_t) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
5347         if ( ! aSign ) return a;
5348         goto invalid;
5349     }
5350     if ( aSign ) {
5351         if ( ( aExp | aSig0 ) == 0 ) return a;
5352  invalid:
5353         float_raise( float_flag_invalid STATUS_VAR);
5354         z.low = floatx80_default_nan_low;
5355         z.high = floatx80_default_nan_high;
5356         return z;
5357     }
5358     if ( aExp == 0 ) {
5359         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5360         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5361     }
5362     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5363     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5364     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5365     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5366     doubleZSig0 = zSig0<<1;
5367     mul64To128( zSig0, zSig0, &term0, &term1 );
5368     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5369     while ( (int64_t) rem0 < 0 ) {
5370         --zSig0;
5371         doubleZSig0 -= 2;
5372         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5373     }
5374     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5375     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5376         if ( zSig1 == 0 ) zSig1 = 1;
5377         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5378         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5379         mul64To128( zSig1, zSig1, &term2, &term3 );
5380         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5381         while ( (int64_t) rem1 < 0 ) {
5382             --zSig1;
5383             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5384             term3 |= 1;
5385             term2 |= doubleZSig0;
5386             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5387         }
5388         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5389     }
5390     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5391     zSig0 |= doubleZSig0;
5392     return
5393         roundAndPackFloatx80(
5394             STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
5395 
5396 }
5397 
5398 /*----------------------------------------------------------------------------
5399 | Returns 1 if the extended double-precision floating-point value `a' is equal
5400 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
5401 | raised if either operand is a NaN.  Otherwise, the comparison is performed
5402 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5403 *----------------------------------------------------------------------------*/
5404 
5405 int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
5406 {
5407 
5408     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5409               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5410          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5411               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5412        ) {
5413         float_raise( float_flag_invalid STATUS_VAR);
5414         return 0;
5415     }
5416     return
5417            ( a.low == b.low )
5418         && (    ( a.high == b.high )
5419              || (    ( a.low == 0 )
5420                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5421            );
5422 
5423 }
5424 
5425 /*----------------------------------------------------------------------------
5426 | Returns 1 if the extended double-precision floating-point value `a' is
5427 | less than or equal to the corresponding value `b', and 0 otherwise.  The
5428 | invalid exception is raised if either operand is a NaN.  The comparison is
5429 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5430 | Arithmetic.
5431 *----------------------------------------------------------------------------*/
5432 
5433 int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
5434 {
5435     flag aSign, bSign;
5436 
5437     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5438               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5439          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5440               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5441        ) {
5442         float_raise( float_flag_invalid STATUS_VAR);
5443         return 0;
5444     }
5445     aSign = extractFloatx80Sign( a );
5446     bSign = extractFloatx80Sign( b );
5447     if ( aSign != bSign ) {
5448         return
5449                aSign
5450             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5451                  == 0 );
5452     }
5453     return
5454           aSign ? le128( b.high, b.low, a.high, a.low )
5455         : le128( a.high, a.low, b.high, b.low );
5456 
5457 }
5458 
5459 /*----------------------------------------------------------------------------
5460 | Returns 1 if the extended double-precision floating-point value `a' is
5461 | less than the corresponding value `b', and 0 otherwise.  The invalid
5462 | exception is raised if either operand is a NaN.  The comparison is performed
5463 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5464 *----------------------------------------------------------------------------*/
5465 
5466 int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
5467 {
5468     flag aSign, bSign;
5469 
5470     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5471               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5472          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5473               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5474        ) {
5475         float_raise( float_flag_invalid STATUS_VAR);
5476         return 0;
5477     }
5478     aSign = extractFloatx80Sign( a );
5479     bSign = extractFloatx80Sign( b );
5480     if ( aSign != bSign ) {
5481         return
5482                aSign
5483             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5484                  != 0 );
5485     }
5486     return
5487           aSign ? lt128( b.high, b.low, a.high, a.low )
5488         : lt128( a.high, a.low, b.high, b.low );
5489 
5490 }
5491 
5492 /*----------------------------------------------------------------------------
5493 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5494 | cannot be compared, and 0 otherwise.  The invalid exception is raised if
5495 | either operand is a NaN.   The comparison is performed according to the
5496 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5497 *----------------------------------------------------------------------------*/
5498 int floatx80_unordered( floatx80 a, floatx80 b STATUS_PARAM )
5499 {
5500     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5501               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5502          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5503               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5504        ) {
5505         float_raise( float_flag_invalid STATUS_VAR);
5506         return 1;
5507     }
5508     return 0;
5509 }
5510 
5511 /*----------------------------------------------------------------------------
5512 | Returns 1 if the extended double-precision floating-point value `a' is
5513 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5514 | cause an exception.  The comparison is performed according to the IEC/IEEE
5515 | Standard for Binary Floating-Point Arithmetic.
5516 *----------------------------------------------------------------------------*/
5517 
5518 int floatx80_eq_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5519 {
5520 
5521     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5522               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5523          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5524               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5525        ) {
5526         if (    floatx80_is_signaling_nan( a )
5527              || floatx80_is_signaling_nan( b ) ) {
5528             float_raise( float_flag_invalid STATUS_VAR);
5529         }
5530         return 0;
5531     }
5532     return
5533            ( a.low == b.low )
5534         && (    ( a.high == b.high )
5535              || (    ( a.low == 0 )
5536                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5537            );
5538 
5539 }
5540 
5541 /*----------------------------------------------------------------------------
5542 | Returns 1 if the extended double-precision floating-point value `a' is less
5543 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
5544 | do not cause an exception.  Otherwise, the comparison is performed according
5545 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5546 *----------------------------------------------------------------------------*/
5547 
5548 int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5549 {
5550     flag aSign, bSign;
5551 
5552     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5553               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5554          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5555               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5556        ) {
5557         if (    floatx80_is_signaling_nan( a )
5558              || floatx80_is_signaling_nan( b ) ) {
5559             float_raise( float_flag_invalid STATUS_VAR);
5560         }
5561         return 0;
5562     }
5563     aSign = extractFloatx80Sign( a );
5564     bSign = extractFloatx80Sign( b );
5565     if ( aSign != bSign ) {
5566         return
5567                aSign
5568             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5569                  == 0 );
5570     }
5571     return
5572           aSign ? le128( b.high, b.low, a.high, a.low )
5573         : le128( a.high, a.low, b.high, b.low );
5574 
5575 }
5576 
5577 /*----------------------------------------------------------------------------
5578 | Returns 1 if the extended double-precision floating-point value `a' is less
5579 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
5580 | an exception.  Otherwise, the comparison is performed according to the
5581 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5582 *----------------------------------------------------------------------------*/
5583 
5584 int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5585 {
5586     flag aSign, bSign;
5587 
5588     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5589               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5590          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5591               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5592        ) {
5593         if (    floatx80_is_signaling_nan( a )
5594              || floatx80_is_signaling_nan( b ) ) {
5595             float_raise( float_flag_invalid STATUS_VAR);
5596         }
5597         return 0;
5598     }
5599     aSign = extractFloatx80Sign( a );
5600     bSign = extractFloatx80Sign( b );
5601     if ( aSign != bSign ) {
5602         return
5603                aSign
5604             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5605                  != 0 );
5606     }
5607     return
5608           aSign ? lt128( b.high, b.low, a.high, a.low )
5609         : lt128( a.high, a.low, b.high, b.low );
5610 
5611 }
5612 
5613 /*----------------------------------------------------------------------------
5614 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5615 | cannot be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.
5616 | The comparison is performed according to the IEC/IEEE Standard for Binary
5617 | Floating-Point Arithmetic.
5618 *----------------------------------------------------------------------------*/
5619 int floatx80_unordered_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5620 {
5621     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5622               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5623          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5624               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5625        ) {
5626         if (    floatx80_is_signaling_nan( a )
5627              || floatx80_is_signaling_nan( b ) ) {
5628             float_raise( float_flag_invalid STATUS_VAR);
5629         }
5630         return 1;
5631     }
5632     return 0;
5633 }
5634 
5635 /*----------------------------------------------------------------------------
5636 | Returns the result of converting the quadruple-precision floating-point
5637 | value `a' to the 32-bit two's complement integer format.  The conversion
5638 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5639 | Arithmetic---which means in particular that the conversion is rounded
5640 | according to the current rounding mode.  If `a' is a NaN, the largest
5641 | positive integer is returned.  Otherwise, if the conversion overflows, the
5642 | largest integer with the same sign as `a' is returned.
5643 *----------------------------------------------------------------------------*/
5644 
5645 int32 float128_to_int32( float128 a STATUS_PARAM )
5646 {
5647     flag aSign;
5648     int32 aExp, shiftCount;
5649     uint64_t aSig0, aSig1;
5650 
5651     aSig1 = extractFloat128Frac1( a );
5652     aSig0 = extractFloat128Frac0( a );
5653     aExp = extractFloat128Exp( a );
5654     aSign = extractFloat128Sign( a );
5655     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5656     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5657     aSig0 |= ( aSig1 != 0 );
5658     shiftCount = 0x4028 - aExp;
5659     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5660     return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
5661 
5662 }
5663 
5664 /*----------------------------------------------------------------------------
5665 | Returns the result of converting the quadruple-precision floating-point
5666 | value `a' to the 32-bit two's complement integer format.  The conversion
5667 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5668 | Arithmetic, except that the conversion is always rounded toward zero.  If
5669 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
5670 | conversion overflows, the largest integer with the same sign as `a' is
5671 | returned.
5672 *----------------------------------------------------------------------------*/
5673 
5674 int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
5675 {
5676     flag aSign;
5677     int32 aExp, shiftCount;
5678     uint64_t aSig0, aSig1, savedASig;
5679     int32_t z;
5680 
5681     aSig1 = extractFloat128Frac1( a );
5682     aSig0 = extractFloat128Frac0( a );
5683     aExp = extractFloat128Exp( a );
5684     aSign = extractFloat128Sign( a );
5685     aSig0 |= ( aSig1 != 0 );
5686     if ( 0x401E < aExp ) {
5687         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5688         goto invalid;
5689     }
5690     else if ( aExp < 0x3FFF ) {
5691         if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
5692         return 0;
5693     }
5694     aSig0 |= LIT64( 0x0001000000000000 );
5695     shiftCount = 0x402F - aExp;
5696     savedASig = aSig0;
5697     aSig0 >>= shiftCount;
5698     z = aSig0;
5699     if ( aSign ) z = - z;
5700     if ( ( z < 0 ) ^ aSign ) {
5701  invalid:
5702         float_raise( float_flag_invalid STATUS_VAR);
5703         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5704     }
5705     if ( ( aSig0<<shiftCount ) != savedASig ) {
5706         STATUS(float_exception_flags) |= float_flag_inexact;
5707     }
5708     return z;
5709 
5710 }
5711 
5712 /*----------------------------------------------------------------------------
5713 | Returns the result of converting the quadruple-precision floating-point
5714 | value `a' to the 64-bit two's complement integer format.  The conversion
5715 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5716 | Arithmetic---which means in particular that the conversion is rounded
5717 | according to the current rounding mode.  If `a' is a NaN, the largest
5718 | positive integer is returned.  Otherwise, if the conversion overflows, the
5719 | largest integer with the same sign as `a' is returned.
5720 *----------------------------------------------------------------------------*/
5721 
5722 int64 float128_to_int64( float128 a STATUS_PARAM )
5723 {
5724     flag aSign;
5725     int32 aExp, shiftCount;
5726     uint64_t aSig0, aSig1;
5727 
5728     aSig1 = extractFloat128Frac1( a );
5729     aSig0 = extractFloat128Frac0( a );
5730     aExp = extractFloat128Exp( a );
5731     aSign = extractFloat128Sign( a );
5732     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5733     shiftCount = 0x402F - aExp;
5734     if ( shiftCount <= 0 ) {
5735         if ( 0x403E < aExp ) {
5736             float_raise( float_flag_invalid STATUS_VAR);
5737             if (    ! aSign
5738                  || (    ( aExp == 0x7FFF )
5739                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5740                     )
5741                ) {
5742                 return LIT64( 0x7FFFFFFFFFFFFFFF );
5743             }
5744             return (int64_t) LIT64( 0x8000000000000000 );
5745         }
5746         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5747     }
5748     else {
5749         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5750     }
5751     return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
5752 
5753 }
5754 
5755 /*----------------------------------------------------------------------------
5756 | Returns the result of converting the quadruple-precision floating-point
5757 | value `a' to the 64-bit two's complement integer format.  The conversion
5758 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5759 | Arithmetic, except that the conversion is always rounded toward zero.
5760 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
5761 | the conversion overflows, the largest integer with the same sign as `a' is
5762 | returned.
5763 *----------------------------------------------------------------------------*/
5764 
5765 int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
5766 {
5767     flag aSign;
5768     int32 aExp, shiftCount;
5769     uint64_t aSig0, aSig1;
5770     int64 z;
5771 
5772     aSig1 = extractFloat128Frac1( a );
5773     aSig0 = extractFloat128Frac0( a );
5774     aExp = extractFloat128Exp( a );
5775     aSign = extractFloat128Sign( a );
5776     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5777     shiftCount = aExp - 0x402F;
5778     if ( 0 < shiftCount ) {
5779         if ( 0x403E <= aExp ) {
5780             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5781             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
5782                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
5783                 if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
5784             }
5785             else {
5786                 float_raise( float_flag_invalid STATUS_VAR);
5787                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
5788                     return LIT64( 0x7FFFFFFFFFFFFFFF );
5789                 }
5790             }
5791             return (int64_t) LIT64( 0x8000000000000000 );
5792         }
5793         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
5794         if ( (uint64_t) ( aSig1<<shiftCount ) ) {
5795             STATUS(float_exception_flags) |= float_flag_inexact;
5796         }
5797     }
5798     else {
5799         if ( aExp < 0x3FFF ) {
5800             if ( aExp | aSig0 | aSig1 ) {
5801                 STATUS(float_exception_flags) |= float_flag_inexact;
5802             }
5803             return 0;
5804         }
5805         z = aSig0>>( - shiftCount );
5806         if (    aSig1
5807              || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
5808             STATUS(float_exception_flags) |= float_flag_inexact;
5809         }
5810     }
5811     if ( aSign ) z = - z;
5812     return z;
5813 
5814 }
5815 
5816 /*----------------------------------------------------------------------------
5817 | Returns the result of converting the quadruple-precision floating-point
5818 | value `a' to the single-precision floating-point format.  The conversion
5819 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5820 | Arithmetic.
5821 *----------------------------------------------------------------------------*/
5822 
5823 float32 float128_to_float32( float128 a STATUS_PARAM )
5824 {
5825     flag aSign;
5826     int32 aExp;
5827     uint64_t aSig0, aSig1;
5828     uint32_t zSig;
5829 
5830     aSig1 = extractFloat128Frac1( a );
5831     aSig0 = extractFloat128Frac0( a );
5832     aExp = extractFloat128Exp( a );
5833     aSign = extractFloat128Sign( a );
5834     if ( aExp == 0x7FFF ) {
5835         if ( aSig0 | aSig1 ) {
5836             return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5837         }
5838         return packFloat32( aSign, 0xFF, 0 );
5839     }
5840     aSig0 |= ( aSig1 != 0 );
5841     shift64RightJamming( aSig0, 18, &aSig0 );
5842     zSig = aSig0;
5843     if ( aExp || zSig ) {
5844         zSig |= 0x40000000;
5845         aExp -= 0x3F81;
5846     }
5847     return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
5848 
5849 }
5850 
5851 /*----------------------------------------------------------------------------
5852 | Returns the result of converting the quadruple-precision floating-point
5853 | value `a' to the double-precision floating-point format.  The conversion
5854 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5855 | Arithmetic.
5856 *----------------------------------------------------------------------------*/
5857 
5858 float64 float128_to_float64( float128 a STATUS_PARAM )
5859 {
5860     flag aSign;
5861     int32 aExp;
5862     uint64_t aSig0, aSig1;
5863 
5864     aSig1 = extractFloat128Frac1( a );
5865     aSig0 = extractFloat128Frac0( a );
5866     aExp = extractFloat128Exp( a );
5867     aSign = extractFloat128Sign( a );
5868     if ( aExp == 0x7FFF ) {
5869         if ( aSig0 | aSig1 ) {
5870             return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5871         }
5872         return packFloat64( aSign, 0x7FF, 0 );
5873     }
5874     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5875     aSig0 |= ( aSig1 != 0 );
5876     if ( aExp || aSig0 ) {
5877         aSig0 |= LIT64( 0x4000000000000000 );
5878         aExp -= 0x3C01;
5879     }
5880     return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
5881 
5882 }
5883 
5884 /*----------------------------------------------------------------------------
5885 | Returns the result of converting the quadruple-precision floating-point
5886 | value `a' to the extended double-precision floating-point format.  The
5887 | conversion is performed according to the IEC/IEEE Standard for Binary
5888 | Floating-Point Arithmetic.
5889 *----------------------------------------------------------------------------*/
5890 
5891 floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
5892 {
5893     flag aSign;
5894     int32 aExp;
5895     uint64_t aSig0, aSig1;
5896 
5897     aSig1 = extractFloat128Frac1( a );
5898     aSig0 = extractFloat128Frac0( a );
5899     aExp = extractFloat128Exp( a );
5900     aSign = extractFloat128Sign( a );
5901     if ( aExp == 0x7FFF ) {
5902         if ( aSig0 | aSig1 ) {
5903             return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5904         }
5905         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5906     }
5907     if ( aExp == 0 ) {
5908         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
5909         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5910     }
5911     else {
5912         aSig0 |= LIT64( 0x0001000000000000 );
5913     }
5914     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
5915     return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
5916 
5917 }
5918 
5919 /*----------------------------------------------------------------------------
5920 | Rounds the quadruple-precision floating-point value `a' to an integer, and
5921 | returns the result as a quadruple-precision floating-point value.  The
5922 | operation is performed according to the IEC/IEEE Standard for Binary
5923 | Floating-Point Arithmetic.
5924 *----------------------------------------------------------------------------*/
5925 
5926 float128 float128_round_to_int( float128 a STATUS_PARAM )
5927 {
5928     flag aSign;
5929     int32 aExp;
5930     uint64_t lastBitMask, roundBitsMask;
5931     float128 z;
5932 
5933     aExp = extractFloat128Exp( a );
5934     if ( 0x402F <= aExp ) {
5935         if ( 0x406F <= aExp ) {
5936             if (    ( aExp == 0x7FFF )
5937                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
5938                ) {
5939                 return propagateFloat128NaN( a, a STATUS_VAR );
5940             }
5941             return a;
5942         }
5943         lastBitMask = 1;
5944         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
5945         roundBitsMask = lastBitMask - 1;
5946         z = a;
5947         switch (STATUS(float_rounding_mode)) {
5948         case float_round_nearest_even:
5949             if ( lastBitMask ) {
5950                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
5951                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
5952             }
5953             else {
5954                 if ( (int64_t) z.low < 0 ) {
5955                     ++z.high;
5956                     if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
5957                 }
5958             }
5959             break;
5960         case float_round_ties_away:
5961             if (lastBitMask) {
5962                 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
5963             } else {
5964                 if ((int64_t) z.low < 0) {
5965                     ++z.high;
5966                 }
5967             }
5968             break;
5969         case float_round_to_zero:
5970             break;
5971         case float_round_up:
5972             if (!extractFloat128Sign(z)) {
5973                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
5974             }
5975             break;
5976         case float_round_down:
5977             if (extractFloat128Sign(z)) {
5978                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
5979             }
5980             break;
5981         default:
5982             abort();
5983         }
5984         z.low &= ~ roundBitsMask;
5985     }
5986     else {
5987         if ( aExp < 0x3FFF ) {
5988             if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
5989             STATUS(float_exception_flags) |= float_flag_inexact;
5990             aSign = extractFloat128Sign( a );
5991             switch ( STATUS(float_rounding_mode) ) {
5992              case float_round_nearest_even:
5993                 if (    ( aExp == 0x3FFE )
5994                      && (   extractFloat128Frac0( a )
5995                           | extractFloat128Frac1( a ) )
5996                    ) {
5997                     return packFloat128( aSign, 0x3FFF, 0, 0 );
5998                 }
5999                 break;
6000             case float_round_ties_away:
6001                 if (aExp == 0x3FFE) {
6002                     return packFloat128(aSign, 0x3FFF, 0, 0);
6003                 }
6004                 break;
6005              case float_round_down:
6006                 return
6007                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
6008                     : packFloat128( 0, 0, 0, 0 );
6009              case float_round_up:
6010                 return
6011                       aSign ? packFloat128( 1, 0, 0, 0 )
6012                     : packFloat128( 0, 0x3FFF, 0, 0 );
6013             }
6014             return packFloat128( aSign, 0, 0, 0 );
6015         }
6016         lastBitMask = 1;
6017         lastBitMask <<= 0x402F - aExp;
6018         roundBitsMask = lastBitMask - 1;
6019         z.low = 0;
6020         z.high = a.high;
6021         switch (STATUS(float_rounding_mode)) {
6022         case float_round_nearest_even:
6023             z.high += lastBitMask>>1;
6024             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
6025                 z.high &= ~ lastBitMask;
6026             }
6027             break;
6028         case float_round_ties_away:
6029             z.high += lastBitMask>>1;
6030             break;
6031         case float_round_to_zero:
6032             break;
6033         case float_round_up:
6034             if (!extractFloat128Sign(z)) {
6035                 z.high |= ( a.low != 0 );
6036                 z.high += roundBitsMask;
6037             }
6038             break;
6039         case float_round_down:
6040             if (extractFloat128Sign(z)) {
6041                 z.high |= (a.low != 0);
6042                 z.high += roundBitsMask;
6043             }
6044             break;
6045         default:
6046             abort();
6047         }
6048         z.high &= ~ roundBitsMask;
6049     }
6050     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
6051         STATUS(float_exception_flags) |= float_flag_inexact;
6052     }
6053     return z;
6054 
6055 }
6056 
6057 /*----------------------------------------------------------------------------
6058 | Returns the result of adding the absolute values of the quadruple-precision
6059 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
6060 | before being returned.  `zSign' is ignored if the result is a NaN.
6061 | The addition is performed according to the IEC/IEEE Standard for Binary
6062 | Floating-Point Arithmetic.
6063 *----------------------------------------------------------------------------*/
6064 
6065 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
6066 {
6067     int32 aExp, bExp, zExp;
6068     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6069     int32 expDiff;
6070 
6071     aSig1 = extractFloat128Frac1( a );
6072     aSig0 = extractFloat128Frac0( a );
6073     aExp = extractFloat128Exp( a );
6074     bSig1 = extractFloat128Frac1( b );
6075     bSig0 = extractFloat128Frac0( b );
6076     bExp = extractFloat128Exp( b );
6077     expDiff = aExp - bExp;
6078     if ( 0 < expDiff ) {
6079         if ( aExp == 0x7FFF ) {
6080             if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6081             return a;
6082         }
6083         if ( bExp == 0 ) {
6084             --expDiff;
6085         }
6086         else {
6087             bSig0 |= LIT64( 0x0001000000000000 );
6088         }
6089         shift128ExtraRightJamming(
6090             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
6091         zExp = aExp;
6092     }
6093     else if ( expDiff < 0 ) {
6094         if ( bExp == 0x7FFF ) {
6095             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6096             return packFloat128( zSign, 0x7FFF, 0, 0 );
6097         }
6098         if ( aExp == 0 ) {
6099             ++expDiff;
6100         }
6101         else {
6102             aSig0 |= LIT64( 0x0001000000000000 );
6103         }
6104         shift128ExtraRightJamming(
6105             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
6106         zExp = bExp;
6107     }
6108     else {
6109         if ( aExp == 0x7FFF ) {
6110             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6111                 return propagateFloat128NaN( a, b STATUS_VAR );
6112             }
6113             return a;
6114         }
6115         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6116         if ( aExp == 0 ) {
6117             if (STATUS(flush_to_zero)) {
6118                 if (zSig0 | zSig1) {
6119                     float_raise(float_flag_output_denormal STATUS_VAR);
6120                 }
6121                 return packFloat128(zSign, 0, 0, 0);
6122             }
6123             return packFloat128( zSign, 0, zSig0, zSig1 );
6124         }
6125         zSig2 = 0;
6126         zSig0 |= LIT64( 0x0002000000000000 );
6127         zExp = aExp;
6128         goto shiftRight1;
6129     }
6130     aSig0 |= LIT64( 0x0001000000000000 );
6131     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6132     --zExp;
6133     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
6134     ++zExp;
6135  shiftRight1:
6136     shift128ExtraRightJamming(
6137         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6138  roundAndPack:
6139     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6140 
6141 }
6142 
6143 /*----------------------------------------------------------------------------
6144 | Returns the result of subtracting the absolute values of the quadruple-
6145 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
6146 | difference is negated before being returned.  `zSign' is ignored if the
6147 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
6148 | Standard for Binary Floating-Point Arithmetic.
6149 *----------------------------------------------------------------------------*/
6150 
6151 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
6152 {
6153     int32 aExp, bExp, zExp;
6154     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
6155     int32 expDiff;
6156     float128 z;
6157 
6158     aSig1 = extractFloat128Frac1( a );
6159     aSig0 = extractFloat128Frac0( a );
6160     aExp = extractFloat128Exp( a );
6161     bSig1 = extractFloat128Frac1( b );
6162     bSig0 = extractFloat128Frac0( b );
6163     bExp = extractFloat128Exp( b );
6164     expDiff = aExp - bExp;
6165     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6166     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6167     if ( 0 < expDiff ) goto aExpBigger;
6168     if ( expDiff < 0 ) goto bExpBigger;
6169     if ( aExp == 0x7FFF ) {
6170         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6171             return propagateFloat128NaN( a, b STATUS_VAR );
6172         }
6173         float_raise( float_flag_invalid STATUS_VAR);
6174         z.low = float128_default_nan_low;
6175         z.high = float128_default_nan_high;
6176         return z;
6177     }
6178     if ( aExp == 0 ) {
6179         aExp = 1;
6180         bExp = 1;
6181     }
6182     if ( bSig0 < aSig0 ) goto aBigger;
6183     if ( aSig0 < bSig0 ) goto bBigger;
6184     if ( bSig1 < aSig1 ) goto aBigger;
6185     if ( aSig1 < bSig1 ) goto bBigger;
6186     return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
6187  bExpBigger:
6188     if ( bExp == 0x7FFF ) {
6189         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6190         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6191     }
6192     if ( aExp == 0 ) {
6193         ++expDiff;
6194     }
6195     else {
6196         aSig0 |= LIT64( 0x4000000000000000 );
6197     }
6198     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6199     bSig0 |= LIT64( 0x4000000000000000 );
6200  bBigger:
6201     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6202     zExp = bExp;
6203     zSign ^= 1;
6204     goto normalizeRoundAndPack;
6205  aExpBigger:
6206     if ( aExp == 0x7FFF ) {
6207         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6208         return a;
6209     }
6210     if ( bExp == 0 ) {
6211         --expDiff;
6212     }
6213     else {
6214         bSig0 |= LIT64( 0x4000000000000000 );
6215     }
6216     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6217     aSig0 |= LIT64( 0x4000000000000000 );
6218  aBigger:
6219     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6220     zExp = aExp;
6221  normalizeRoundAndPack:
6222     --zExp;
6223     return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
6224 
6225 }
6226 
6227 /*----------------------------------------------------------------------------
6228 | Returns the result of adding the quadruple-precision floating-point values
6229 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
6230 | for Binary Floating-Point Arithmetic.
6231 *----------------------------------------------------------------------------*/
6232 
6233 float128 float128_add( float128 a, float128 b STATUS_PARAM )
6234 {
6235     flag aSign, bSign;
6236 
6237     aSign = extractFloat128Sign( a );
6238     bSign = extractFloat128Sign( b );
6239     if ( aSign == bSign ) {
6240         return addFloat128Sigs( a, b, aSign STATUS_VAR );
6241     }
6242     else {
6243         return subFloat128Sigs( a, b, aSign STATUS_VAR );
6244     }
6245 
6246 }
6247 
6248 /*----------------------------------------------------------------------------
6249 | Returns the result of subtracting the quadruple-precision floating-point
6250 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6251 | Standard for Binary Floating-Point Arithmetic.
6252 *----------------------------------------------------------------------------*/
6253 
6254 float128 float128_sub( float128 a, float128 b STATUS_PARAM )
6255 {
6256     flag aSign, bSign;
6257 
6258     aSign = extractFloat128Sign( a );
6259     bSign = extractFloat128Sign( b );
6260     if ( aSign == bSign ) {
6261         return subFloat128Sigs( a, b, aSign STATUS_VAR );
6262     }
6263     else {
6264         return addFloat128Sigs( a, b, aSign STATUS_VAR );
6265     }
6266 
6267 }
6268 
6269 /*----------------------------------------------------------------------------
6270 | Returns the result of multiplying the quadruple-precision floating-point
6271 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6272 | Standard for Binary Floating-Point Arithmetic.
6273 *----------------------------------------------------------------------------*/
6274 
6275 float128 float128_mul( float128 a, float128 b STATUS_PARAM )
6276 {
6277     flag aSign, bSign, zSign;
6278     int32 aExp, bExp, zExp;
6279     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
6280     float128 z;
6281 
6282     aSig1 = extractFloat128Frac1( a );
6283     aSig0 = extractFloat128Frac0( a );
6284     aExp = extractFloat128Exp( a );
6285     aSign = extractFloat128Sign( a );
6286     bSig1 = extractFloat128Frac1( b );
6287     bSig0 = extractFloat128Frac0( b );
6288     bExp = extractFloat128Exp( b );
6289     bSign = extractFloat128Sign( b );
6290     zSign = aSign ^ bSign;
6291     if ( aExp == 0x7FFF ) {
6292         if (    ( aSig0 | aSig1 )
6293              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6294             return propagateFloat128NaN( a, b STATUS_VAR );
6295         }
6296         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6297         return packFloat128( zSign, 0x7FFF, 0, 0 );
6298     }
6299     if ( bExp == 0x7FFF ) {
6300         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6301         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6302  invalid:
6303             float_raise( float_flag_invalid STATUS_VAR);
6304             z.low = float128_default_nan_low;
6305             z.high = float128_default_nan_high;
6306             return z;
6307         }
6308         return packFloat128( zSign, 0x7FFF, 0, 0 );
6309     }
6310     if ( aExp == 0 ) {
6311         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6312         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6313     }
6314     if ( bExp == 0 ) {
6315         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6316         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6317     }
6318     zExp = aExp + bExp - 0x4000;
6319     aSig0 |= LIT64( 0x0001000000000000 );
6320     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6321     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6322     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6323     zSig2 |= ( zSig3 != 0 );
6324     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6325         shift128ExtraRightJamming(
6326             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6327         ++zExp;
6328     }
6329     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6330 
6331 }
6332 
6333 /*----------------------------------------------------------------------------
6334 | Returns the result of dividing the quadruple-precision floating-point value
6335 | `a' by the corresponding value `b'.  The operation is performed according to
6336 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6337 *----------------------------------------------------------------------------*/
6338 
6339 float128 float128_div( float128 a, float128 b STATUS_PARAM )
6340 {
6341     flag aSign, bSign, zSign;
6342     int32 aExp, bExp, zExp;
6343     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6344     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6345     float128 z;
6346 
6347     aSig1 = extractFloat128Frac1( a );
6348     aSig0 = extractFloat128Frac0( a );
6349     aExp = extractFloat128Exp( a );
6350     aSign = extractFloat128Sign( a );
6351     bSig1 = extractFloat128Frac1( b );
6352     bSig0 = extractFloat128Frac0( b );
6353     bExp = extractFloat128Exp( b );
6354     bSign = extractFloat128Sign( b );
6355     zSign = aSign ^ bSign;
6356     if ( aExp == 0x7FFF ) {
6357         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6358         if ( bExp == 0x7FFF ) {
6359             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6360             goto invalid;
6361         }
6362         return packFloat128( zSign, 0x7FFF, 0, 0 );
6363     }
6364     if ( bExp == 0x7FFF ) {
6365         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6366         return packFloat128( zSign, 0, 0, 0 );
6367     }
6368     if ( bExp == 0 ) {
6369         if ( ( bSig0 | bSig1 ) == 0 ) {
6370             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6371  invalid:
6372                 float_raise( float_flag_invalid STATUS_VAR);
6373                 z.low = float128_default_nan_low;
6374                 z.high = float128_default_nan_high;
6375                 return z;
6376             }
6377             float_raise( float_flag_divbyzero STATUS_VAR);
6378             return packFloat128( zSign, 0x7FFF, 0, 0 );
6379         }
6380         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6381     }
6382     if ( aExp == 0 ) {
6383         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6384         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6385     }
6386     zExp = aExp - bExp + 0x3FFD;
6387     shortShift128Left(
6388         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6389     shortShift128Left(
6390         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6391     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6392         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6393         ++zExp;
6394     }
6395     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6396     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6397     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
6398     while ( (int64_t) rem0 < 0 ) {
6399         --zSig0;
6400         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6401     }
6402     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6403     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6404         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6405         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
6406         while ( (int64_t) rem1 < 0 ) {
6407             --zSig1;
6408             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6409         }
6410         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6411     }
6412     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
6413     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6414 
6415 }
6416 
6417 /*----------------------------------------------------------------------------
6418 | Returns the remainder of the quadruple-precision floating-point value `a'
6419 | with respect to the corresponding value `b'.  The operation is performed
6420 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6421 *----------------------------------------------------------------------------*/
6422 
6423 float128 float128_rem( float128 a, float128 b STATUS_PARAM )
6424 {
6425     flag aSign, zSign;
6426     int32 aExp, bExp, expDiff;
6427     uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6428     uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6429     int64_t sigMean0;
6430     float128 z;
6431 
6432     aSig1 = extractFloat128Frac1( a );
6433     aSig0 = extractFloat128Frac0( a );
6434     aExp = extractFloat128Exp( a );
6435     aSign = extractFloat128Sign( a );
6436     bSig1 = extractFloat128Frac1( b );
6437     bSig0 = extractFloat128Frac0( b );
6438     bExp = extractFloat128Exp( b );
6439     if ( aExp == 0x7FFF ) {
6440         if (    ( aSig0 | aSig1 )
6441              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6442             return propagateFloat128NaN( a, b STATUS_VAR );
6443         }
6444         goto invalid;
6445     }
6446     if ( bExp == 0x7FFF ) {
6447         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6448         return a;
6449     }
6450     if ( bExp == 0 ) {
6451         if ( ( bSig0 | bSig1 ) == 0 ) {
6452  invalid:
6453             float_raise( float_flag_invalid STATUS_VAR);
6454             z.low = float128_default_nan_low;
6455             z.high = float128_default_nan_high;
6456             return z;
6457         }
6458         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6459     }
6460     if ( aExp == 0 ) {
6461         if ( ( aSig0 | aSig1 ) == 0 ) return a;
6462         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6463     }
6464     expDiff = aExp - bExp;
6465     if ( expDiff < -1 ) return a;
6466     shortShift128Left(
6467         aSig0 | LIT64( 0x0001000000000000 ),
6468         aSig1,
6469         15 - ( expDiff < 0 ),
6470         &aSig0,
6471         &aSig1
6472     );
6473     shortShift128Left(
6474         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6475     q = le128( bSig0, bSig1, aSig0, aSig1 );
6476     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6477     expDiff -= 64;
6478     while ( 0 < expDiff ) {
6479         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6480         q = ( 4 < q ) ? q - 4 : 0;
6481         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6482         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6483         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6484         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6485         expDiff -= 61;
6486     }
6487     if ( -64 < expDiff ) {
6488         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6489         q = ( 4 < q ) ? q - 4 : 0;
6490         q >>= - expDiff;
6491         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6492         expDiff += 52;
6493         if ( expDiff < 0 ) {
6494             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6495         }
6496         else {
6497             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6498         }
6499         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6500         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6501     }
6502     else {
6503         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6504         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6505     }
6506     do {
6507         alternateASig0 = aSig0;
6508         alternateASig1 = aSig1;
6509         ++q;
6510         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6511     } while ( 0 <= (int64_t) aSig0 );
6512     add128(
6513         aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
6514     if (    ( sigMean0 < 0 )
6515          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6516         aSig0 = alternateASig0;
6517         aSig1 = alternateASig1;
6518     }
6519     zSign = ( (int64_t) aSig0 < 0 );
6520     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6521     return
6522         normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
6523 
6524 }
6525 
6526 /*----------------------------------------------------------------------------
6527 | Returns the square root of the quadruple-precision floating-point value `a'.
6528 | The operation is performed according to the IEC/IEEE Standard for Binary
6529 | Floating-Point Arithmetic.
6530 *----------------------------------------------------------------------------*/
6531 
6532 float128 float128_sqrt( float128 a STATUS_PARAM )
6533 {
6534     flag aSign;
6535     int32 aExp, zExp;
6536     uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6537     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6538     float128 z;
6539 
6540     aSig1 = extractFloat128Frac1( a );
6541     aSig0 = extractFloat128Frac0( a );
6542     aExp = extractFloat128Exp( a );
6543     aSign = extractFloat128Sign( a );
6544     if ( aExp == 0x7FFF ) {
6545         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
6546         if ( ! aSign ) return a;
6547         goto invalid;
6548     }
6549     if ( aSign ) {
6550         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6551  invalid:
6552         float_raise( float_flag_invalid STATUS_VAR);
6553         z.low = float128_default_nan_low;
6554         z.high = float128_default_nan_high;
6555         return z;
6556     }
6557     if ( aExp == 0 ) {
6558         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6559         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6560     }
6561     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6562     aSig0 |= LIT64( 0x0001000000000000 );
6563     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6564     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6565     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6566     doubleZSig0 = zSig0<<1;
6567     mul64To128( zSig0, zSig0, &term0, &term1 );
6568     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6569     while ( (int64_t) rem0 < 0 ) {
6570         --zSig0;
6571         doubleZSig0 -= 2;
6572         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6573     }
6574     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6575     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6576         if ( zSig1 == 0 ) zSig1 = 1;
6577         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6578         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6579         mul64To128( zSig1, zSig1, &term2, &term3 );
6580         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6581         while ( (int64_t) rem1 < 0 ) {
6582             --zSig1;
6583             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6584             term3 |= 1;
6585             term2 |= doubleZSig0;
6586             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6587         }
6588         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6589     }
6590     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6591     return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6592 
6593 }
6594 
6595 /*----------------------------------------------------------------------------
6596 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6597 | the corresponding value `b', and 0 otherwise.  The invalid exception is
6598 | raised if either operand is a NaN.  Otherwise, the comparison is performed
6599 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6600 *----------------------------------------------------------------------------*/
6601 
6602 int float128_eq( float128 a, float128 b STATUS_PARAM )
6603 {
6604 
6605     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6606               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6607          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6608               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6609        ) {
6610         float_raise( float_flag_invalid STATUS_VAR);
6611         return 0;
6612     }
6613     return
6614            ( a.low == b.low )
6615         && (    ( a.high == b.high )
6616              || (    ( a.low == 0 )
6617                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6618            );
6619 
6620 }
6621 
6622 /*----------------------------------------------------------------------------
6623 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6624 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
6625 | exception is raised if either operand is a NaN.  The comparison is performed
6626 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6627 *----------------------------------------------------------------------------*/
6628 
6629 int float128_le( float128 a, float128 b STATUS_PARAM )
6630 {
6631     flag aSign, bSign;
6632 
6633     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6634               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6635          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6636               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6637        ) {
6638         float_raise( float_flag_invalid STATUS_VAR);
6639         return 0;
6640     }
6641     aSign = extractFloat128Sign( a );
6642     bSign = extractFloat128Sign( b );
6643     if ( aSign != bSign ) {
6644         return
6645                aSign
6646             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6647                  == 0 );
6648     }
6649     return
6650           aSign ? le128( b.high, b.low, a.high, a.low )
6651         : le128( a.high, a.low, b.high, b.low );
6652 
6653 }
6654 
6655 /*----------------------------------------------------------------------------
6656 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6657 | the corresponding value `b', and 0 otherwise.  The invalid exception is
6658 | raised if either operand is a NaN.  The comparison is performed according
6659 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6660 *----------------------------------------------------------------------------*/
6661 
6662 int float128_lt( float128 a, float128 b STATUS_PARAM )
6663 {
6664     flag aSign, bSign;
6665 
6666     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6667               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6668          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6669               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6670        ) {
6671         float_raise( float_flag_invalid STATUS_VAR);
6672         return 0;
6673     }
6674     aSign = extractFloat128Sign( a );
6675     bSign = extractFloat128Sign( b );
6676     if ( aSign != bSign ) {
6677         return
6678                aSign
6679             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6680                  != 0 );
6681     }
6682     return
6683           aSign ? lt128( b.high, b.low, a.high, a.low )
6684         : lt128( a.high, a.low, b.high, b.low );
6685 
6686 }
6687 
6688 /*----------------------------------------------------------------------------
6689 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6690 | be compared, and 0 otherwise.  The invalid exception is raised if either
6691 | operand is a NaN. The comparison is performed according to the IEC/IEEE
6692 | Standard for Binary Floating-Point Arithmetic.
6693 *----------------------------------------------------------------------------*/
6694 
6695 int float128_unordered( float128 a, float128 b STATUS_PARAM )
6696 {
6697     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6698               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6699          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6700               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6701        ) {
6702         float_raise( float_flag_invalid STATUS_VAR);
6703         return 1;
6704     }
6705     return 0;
6706 }
6707 
6708 /*----------------------------------------------------------------------------
6709 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6710 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
6711 | exception.  The comparison is performed according to the IEC/IEEE Standard
6712 | for Binary Floating-Point Arithmetic.
6713 *----------------------------------------------------------------------------*/
6714 
6715 int float128_eq_quiet( float128 a, float128 b STATUS_PARAM )
6716 {
6717 
6718     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6719               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6720          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6721               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6722        ) {
6723         if (    float128_is_signaling_nan( a )
6724              || float128_is_signaling_nan( b ) ) {
6725             float_raise( float_flag_invalid STATUS_VAR);
6726         }
6727         return 0;
6728     }
6729     return
6730            ( a.low == b.low )
6731         && (    ( a.high == b.high )
6732              || (    ( a.low == 0 )
6733                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6734            );
6735 
6736 }
6737 
6738 /*----------------------------------------------------------------------------
6739 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6740 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
6741 | cause an exception.  Otherwise, the comparison is performed according to the
6742 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6743 *----------------------------------------------------------------------------*/
6744 
6745 int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
6746 {
6747     flag aSign, bSign;
6748 
6749     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6750               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6751          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6752               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6753        ) {
6754         if (    float128_is_signaling_nan( a )
6755              || float128_is_signaling_nan( b ) ) {
6756             float_raise( float_flag_invalid STATUS_VAR);
6757         }
6758         return 0;
6759     }
6760     aSign = extractFloat128Sign( a );
6761     bSign = extractFloat128Sign( b );
6762     if ( aSign != bSign ) {
6763         return
6764                aSign
6765             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6766                  == 0 );
6767     }
6768     return
6769           aSign ? le128( b.high, b.low, a.high, a.low )
6770         : le128( a.high, a.low, b.high, b.low );
6771 
6772 }
6773 
6774 /*----------------------------------------------------------------------------
6775 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6776 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
6777 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
6778 | Standard for Binary Floating-Point Arithmetic.
6779 *----------------------------------------------------------------------------*/
6780 
6781 int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
6782 {
6783     flag aSign, bSign;
6784 
6785     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6786               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6787          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6788               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6789        ) {
6790         if (    float128_is_signaling_nan( a )
6791              || float128_is_signaling_nan( b ) ) {
6792             float_raise( float_flag_invalid STATUS_VAR);
6793         }
6794         return 0;
6795     }
6796     aSign = extractFloat128Sign( a );
6797     bSign = extractFloat128Sign( b );
6798     if ( aSign != bSign ) {
6799         return
6800                aSign
6801             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6802                  != 0 );
6803     }
6804     return
6805           aSign ? lt128( b.high, b.low, a.high, a.low )
6806         : lt128( a.high, a.low, b.high, b.low );
6807 
6808 }
6809 
6810 /*----------------------------------------------------------------------------
6811 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6812 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
6813 | comparison is performed according to the IEC/IEEE Standard for Binary
6814 | Floating-Point Arithmetic.
6815 *----------------------------------------------------------------------------*/
6816 
6817 int float128_unordered_quiet( float128 a, float128 b STATUS_PARAM )
6818 {
6819     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6820               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6821          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6822               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6823        ) {
6824         if (    float128_is_signaling_nan( a )
6825              || float128_is_signaling_nan( b ) ) {
6826             float_raise( float_flag_invalid STATUS_VAR);
6827         }
6828         return 1;
6829     }
6830     return 0;
6831 }
6832 
6833 /* misc functions */
6834 float32 uint32_to_float32(uint32_t a STATUS_PARAM)
6835 {
6836     return int64_to_float32(a STATUS_VAR);
6837 }
6838 
6839 float64 uint32_to_float64(uint32_t a STATUS_PARAM)
6840 {
6841     return int64_to_float64(a STATUS_VAR);
6842 }
6843 
6844 uint32 float32_to_uint32( float32 a STATUS_PARAM )
6845 {
6846     int64_t v;
6847     uint32 res;
6848     int old_exc_flags = get_float_exception_flags(status);
6849 
6850     v = float32_to_int64(a STATUS_VAR);
6851     if (v < 0) {
6852         res = 0;
6853     } else if (v > 0xffffffff) {
6854         res = 0xffffffff;
6855     } else {
6856         return v;
6857     }
6858     set_float_exception_flags(old_exc_flags, status);
6859     float_raise(float_flag_invalid STATUS_VAR);
6860     return res;
6861 }
6862 
6863 uint32 float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
6864 {
6865     int64_t v;
6866     uint32 res;
6867     int old_exc_flags = get_float_exception_flags(status);
6868 
6869     v = float32_to_int64_round_to_zero(a STATUS_VAR);
6870     if (v < 0) {
6871         res = 0;
6872     } else if (v > 0xffffffff) {
6873         res = 0xffffffff;
6874     } else {
6875         return v;
6876     }
6877     set_float_exception_flags(old_exc_flags, status);
6878     float_raise(float_flag_invalid STATUS_VAR);
6879     return res;
6880 }
6881 
6882 int_fast16_t float32_to_int16(float32 a STATUS_PARAM)
6883 {
6884     int32_t v;
6885     int_fast16_t res;
6886     int old_exc_flags = get_float_exception_flags(status);
6887 
6888     v = float32_to_int32(a STATUS_VAR);
6889     if (v < -0x8000) {
6890         res = -0x8000;
6891     } else if (v > 0x7fff) {
6892         res = 0x7fff;
6893     } else {
6894         return v;
6895     }
6896 
6897     set_float_exception_flags(old_exc_flags, status);
6898     float_raise(float_flag_invalid STATUS_VAR);
6899     return res;
6900 }
6901 
6902 uint_fast16_t float32_to_uint16(float32 a STATUS_PARAM)
6903 {
6904     int32_t v;
6905     uint_fast16_t res;
6906     int old_exc_flags = get_float_exception_flags(status);
6907 
6908     v = float32_to_int32(a STATUS_VAR);
6909     if (v < 0) {
6910         res = 0;
6911     } else if (v > 0xffff) {
6912         res = 0xffff;
6913     } else {
6914         return v;
6915     }
6916 
6917     set_float_exception_flags(old_exc_flags, status);
6918     float_raise(float_flag_invalid STATUS_VAR);
6919     return res;
6920 }
6921 
6922 uint_fast16_t float32_to_uint16_round_to_zero(float32 a STATUS_PARAM)
6923 {
6924     int64_t v;
6925     uint_fast16_t res;
6926     int old_exc_flags = get_float_exception_flags(status);
6927 
6928     v = float32_to_int64_round_to_zero(a STATUS_VAR);
6929     if (v < 0) {
6930         res = 0;
6931     } else if (v > 0xffff) {
6932         res = 0xffff;
6933     } else {
6934         return v;
6935     }
6936     set_float_exception_flags(old_exc_flags, status);
6937     float_raise(float_flag_invalid STATUS_VAR);
6938     return res;
6939 }
6940 
6941 uint32 float64_to_uint32( float64 a STATUS_PARAM )
6942 {
6943     uint64_t v;
6944     uint32 res;
6945     int old_exc_flags = get_float_exception_flags(status);
6946 
6947     v = float64_to_uint64(a STATUS_VAR);
6948     if (v > 0xffffffff) {
6949         res = 0xffffffff;
6950     } else {
6951         return v;
6952     }
6953     set_float_exception_flags(old_exc_flags, status);
6954     float_raise(float_flag_invalid STATUS_VAR);
6955     return res;
6956 }
6957 
6958 uint32 float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
6959 {
6960     uint64_t v;
6961     uint32 res;
6962     int old_exc_flags = get_float_exception_flags(status);
6963 
6964     v = float64_to_uint64_round_to_zero(a STATUS_VAR);
6965     if (v > 0xffffffff) {
6966         res = 0xffffffff;
6967     } else {
6968         return v;
6969     }
6970     set_float_exception_flags(old_exc_flags, status);
6971     float_raise(float_flag_invalid STATUS_VAR);
6972     return res;
6973 }
6974 
6975 int_fast16_t float64_to_int16(float64 a STATUS_PARAM)
6976 {
6977     int64_t v;
6978     int_fast16_t res;
6979     int old_exc_flags = get_float_exception_flags(status);
6980 
6981     v = float64_to_int32(a STATUS_VAR);
6982     if (v < -0x8000) {
6983         res = -0x8000;
6984     } else if (v > 0x7fff) {
6985         res = 0x7fff;
6986     } else {
6987         return v;
6988     }
6989 
6990     set_float_exception_flags(old_exc_flags, status);
6991     float_raise(float_flag_invalid STATUS_VAR);
6992     return res;
6993 }
6994 
6995 uint_fast16_t float64_to_uint16(float64 a STATUS_PARAM)
6996 {
6997     int64_t v;
6998     uint_fast16_t res;
6999     int old_exc_flags = get_float_exception_flags(status);
7000 
7001     v = float64_to_int32(a STATUS_VAR);
7002     if (v < 0) {
7003         res = 0;
7004     } else if (v > 0xffff) {
7005         res = 0xffff;
7006     } else {
7007         return v;
7008     }
7009 
7010     set_float_exception_flags(old_exc_flags, status);
7011     float_raise(float_flag_invalid STATUS_VAR);
7012     return res;
7013 }
7014 
7015 uint_fast16_t float64_to_uint16_round_to_zero(float64 a STATUS_PARAM)
7016 {
7017     int64_t v;
7018     uint_fast16_t res;
7019     int old_exc_flags = get_float_exception_flags(status);
7020 
7021     v = float64_to_int64_round_to_zero(a STATUS_VAR);
7022     if (v < 0) {
7023         res = 0;
7024     } else if (v > 0xffff) {
7025         res = 0xffff;
7026     } else {
7027         return v;
7028     }
7029     set_float_exception_flags(old_exc_flags, status);
7030     float_raise(float_flag_invalid STATUS_VAR);
7031     return res;
7032 }
7033 
7034 /*----------------------------------------------------------------------------
7035 | Returns the result of converting the double-precision floating-point value
7036 | `a' to the 64-bit unsigned integer format.  The conversion is
7037 | performed according to the IEC/IEEE Standard for Binary Floating-Point
7038 | Arithmetic---which means in particular that the conversion is rounded
7039 | according to the current rounding mode.  If `a' is a NaN, the largest
7040 | positive integer is returned.  If the conversion overflows, the
7041 | largest unsigned integer is returned.  If 'a' is negative, the value is
7042 | rounded and zero is returned; negative values that do not round to zero
7043 | will raise the inexact exception.
7044 *----------------------------------------------------------------------------*/
7045 
7046 uint64_t float64_to_uint64(float64 a STATUS_PARAM)
7047 {
7048     flag aSign;
7049     int_fast16_t aExp, shiftCount;
7050     uint64_t aSig, aSigExtra;
7051     a = float64_squash_input_denormal(a STATUS_VAR);
7052 
7053     aSig = extractFloat64Frac(a);
7054     aExp = extractFloat64Exp(a);
7055     aSign = extractFloat64Sign(a);
7056     if (aSign && (aExp > 1022)) {
7057         float_raise(float_flag_invalid STATUS_VAR);
7058         if (float64_is_any_nan(a)) {
7059             return LIT64(0xFFFFFFFFFFFFFFFF);
7060         } else {
7061             return 0;
7062         }
7063     }
7064     if (aExp) {
7065         aSig |= LIT64(0x0010000000000000);
7066     }
7067     shiftCount = 0x433 - aExp;
7068     if (shiftCount <= 0) {
7069         if (0x43E < aExp) {
7070             float_raise(float_flag_invalid STATUS_VAR);
7071             return LIT64(0xFFFFFFFFFFFFFFFF);
7072         }
7073         aSigExtra = 0;
7074         aSig <<= -shiftCount;
7075     } else {
7076         shift64ExtraRightJamming(aSig, 0, shiftCount, &aSig, &aSigExtra);
7077     }
7078     return roundAndPackUint64(aSign, aSig, aSigExtra STATUS_VAR);
7079 }
7080 
7081 uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
7082 {
7083     signed char current_rounding_mode = STATUS(float_rounding_mode);
7084     set_float_rounding_mode(float_round_to_zero STATUS_VAR);
7085     int64_t v = float64_to_uint64(a STATUS_VAR);
7086     set_float_rounding_mode(current_rounding_mode STATUS_VAR);
7087     return v;
7088 }
7089 
7090 #define COMPARE(s, nan_exp)                                                  \
7091 INLINE int float ## s ## _compare_internal( float ## s a, float ## s b,      \
7092                                       int is_quiet STATUS_PARAM )            \
7093 {                                                                            \
7094     flag aSign, bSign;                                                       \
7095     uint ## s ## _t av, bv;                                                  \
7096     a = float ## s ## _squash_input_denormal(a STATUS_VAR);                  \
7097     b = float ## s ## _squash_input_denormal(b STATUS_VAR);                  \
7098                                                                              \
7099     if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) &&                    \
7100          extractFloat ## s ## Frac( a ) ) ||                                 \
7101         ( ( extractFloat ## s ## Exp( b ) == nan_exp ) &&                    \
7102           extractFloat ## s ## Frac( b ) )) {                                \
7103         if (!is_quiet ||                                                     \
7104             float ## s ## _is_signaling_nan( a ) ||                          \
7105             float ## s ## _is_signaling_nan( b ) ) {                         \
7106             float_raise( float_flag_invalid STATUS_VAR);                     \
7107         }                                                                    \
7108         return float_relation_unordered;                                     \
7109     }                                                                        \
7110     aSign = extractFloat ## s ## Sign( a );                                  \
7111     bSign = extractFloat ## s ## Sign( b );                                  \
7112     av = float ## s ## _val(a);                                              \
7113     bv = float ## s ## _val(b);                                              \
7114     if ( aSign != bSign ) {                                                  \
7115         if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) {                   \
7116             /* zero case */                                                  \
7117             return float_relation_equal;                                     \
7118         } else {                                                             \
7119             return 1 - (2 * aSign);                                          \
7120         }                                                                    \
7121     } else {                                                                 \
7122         if (av == bv) {                                                      \
7123             return float_relation_equal;                                     \
7124         } else {                                                             \
7125             return 1 - 2 * (aSign ^ ( av < bv ));                            \
7126         }                                                                    \
7127     }                                                                        \
7128 }                                                                            \
7129                                                                              \
7130 int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM )        \
7131 {                                                                            \
7132     return float ## s ## _compare_internal(a, b, 0 STATUS_VAR);              \
7133 }                                                                            \
7134                                                                              \
7135 int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM )  \
7136 {                                                                            \
7137     return float ## s ## _compare_internal(a, b, 1 STATUS_VAR);              \
7138 }
7139 
7140 COMPARE(32, 0xff)
7141 COMPARE(64, 0x7ff)
7142 
7143 INLINE int floatx80_compare_internal( floatx80 a, floatx80 b,
7144                                       int is_quiet STATUS_PARAM )
7145 {
7146     flag aSign, bSign;
7147 
7148     if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
7149           ( extractFloatx80Frac( a )<<1 ) ) ||
7150         ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
7151           ( extractFloatx80Frac( b )<<1 ) )) {
7152         if (!is_quiet ||
7153             floatx80_is_signaling_nan( a ) ||
7154             floatx80_is_signaling_nan( b ) ) {
7155             float_raise( float_flag_invalid STATUS_VAR);
7156         }
7157         return float_relation_unordered;
7158     }
7159     aSign = extractFloatx80Sign( a );
7160     bSign = extractFloatx80Sign( b );
7161     if ( aSign != bSign ) {
7162 
7163         if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
7164              ( ( a.low | b.low ) == 0 ) ) {
7165             /* zero case */
7166             return float_relation_equal;
7167         } else {
7168             return 1 - (2 * aSign);
7169         }
7170     } else {
7171         if (a.low == b.low && a.high == b.high) {
7172             return float_relation_equal;
7173         } else {
7174             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7175         }
7176     }
7177 }
7178 
7179 int floatx80_compare( floatx80 a, floatx80 b STATUS_PARAM )
7180 {
7181     return floatx80_compare_internal(a, b, 0 STATUS_VAR);
7182 }
7183 
7184 int floatx80_compare_quiet( floatx80 a, floatx80 b STATUS_PARAM )
7185 {
7186     return floatx80_compare_internal(a, b, 1 STATUS_VAR);
7187 }
7188 
7189 INLINE int float128_compare_internal( float128 a, float128 b,
7190                                       int is_quiet STATUS_PARAM )
7191 {
7192     flag aSign, bSign;
7193 
7194     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7195           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7196         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7197           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7198         if (!is_quiet ||
7199             float128_is_signaling_nan( a ) ||
7200             float128_is_signaling_nan( b ) ) {
7201             float_raise( float_flag_invalid STATUS_VAR);
7202         }
7203         return float_relation_unordered;
7204     }
7205     aSign = extractFloat128Sign( a );
7206     bSign = extractFloat128Sign( b );
7207     if ( aSign != bSign ) {
7208         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7209             /* zero case */
7210             return float_relation_equal;
7211         } else {
7212             return 1 - (2 * aSign);
7213         }
7214     } else {
7215         if (a.low == b.low && a.high == b.high) {
7216             return float_relation_equal;
7217         } else {
7218             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7219         }
7220     }
7221 }
7222 
7223 int float128_compare( float128 a, float128 b STATUS_PARAM )
7224 {
7225     return float128_compare_internal(a, b, 0 STATUS_VAR);
7226 }
7227 
7228 int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
7229 {
7230     return float128_compare_internal(a, b, 1 STATUS_VAR);
7231 }
7232 
/* min() and max() functions. These can't be implemented as
 * 'compare and pick one input' because that would mishandle
 * NaNs and +0 vs -0.
 *
 * minnum() and maxnum() functions. These are similar to the min()
 * and max() functions but if one of the arguments is a QNaN and
 * the other is numerical then the numerical argument is returned.
 * minnum() and maxnum() correspond to the IEEE 754-2008 minNum()
 * and maxNum() operations. min() and max() are the typical min/max
 * semantics provided by many CPUs which predate that specification.
 */
7244 #define MINMAX(s)                                                       \
7245 INLINE float ## s float ## s ## _minmax(float ## s a, float ## s b,     \
7246                                         int ismin, int isieee STATUS_PARAM) \
7247 {                                                                       \
7248     flag aSign, bSign;                                                  \
7249     uint ## s ## _t av, bv;                                             \
7250     a = float ## s ## _squash_input_denormal(a STATUS_VAR);             \
7251     b = float ## s ## _squash_input_denormal(b STATUS_VAR);             \
7252     if (float ## s ## _is_any_nan(a) ||                                 \
7253         float ## s ## _is_any_nan(b)) {                                 \
7254         if (isieee) {                                                   \
7255             if (float ## s ## _is_quiet_nan(a) &&                       \
7256                 !float ## s ##_is_any_nan(b)) {                         \
7257                 return b;                                               \
7258             } else if (float ## s ## _is_quiet_nan(b) &&                \
7259                        !float ## s ## _is_any_nan(a)) {                 \
7260                 return a;                                               \
7261             }                                                           \
7262         }                                                               \
7263         return propagateFloat ## s ## NaN(a, b STATUS_VAR);             \
7264     }                                                                   \
7265     aSign = extractFloat ## s ## Sign(a);                               \
7266     bSign = extractFloat ## s ## Sign(b);                               \
7267     av = float ## s ## _val(a);                                         \
7268     bv = float ## s ## _val(b);                                         \
7269     if (aSign != bSign) {                                               \
7270         if (ismin) {                                                    \
7271             return aSign ? a : b;                                       \
7272         } else {                                                        \
7273             return aSign ? b : a;                                       \
7274         }                                                               \
7275     } else {                                                            \
7276         if (ismin) {                                                    \
7277             return (aSign ^ (av < bv)) ? a : b;                         \
7278         } else {                                                        \
7279             return (aSign ^ (av < bv)) ? b : a;                         \
7280         }                                                               \
7281     }                                                                   \
7282 }                                                                       \
7283                                                                         \
7284 float ## s float ## s ## _min(float ## s a, float ## s b STATUS_PARAM)  \
7285 {                                                                       \
7286     return float ## s ## _minmax(a, b, 1, 0 STATUS_VAR);                \
7287 }                                                                       \
7288                                                                         \
7289 float ## s float ## s ## _max(float ## s a, float ## s b STATUS_PARAM)  \
7290 {                                                                       \
7291     return float ## s ## _minmax(a, b, 0, 0 STATUS_VAR);                \
7292 }                                                                       \
7293                                                                         \
7294 float ## s float ## s ## _minnum(float ## s a, float ## s b STATUS_PARAM) \
7295 {                                                                       \
7296     return float ## s ## _minmax(a, b, 1, 1 STATUS_VAR);                \
7297 }                                                                       \
7298                                                                         \
7299 float ## s float ## s ## _maxnum(float ## s a, float ## s b STATUS_PARAM) \
7300 {                                                                       \
7301     return float ## s ## _minmax(a, b, 0, 1 STATUS_VAR);                \
7302 }
7303 
7304 MINMAX(32)
7305 MINMAX(64)
7306 
7307 
7308 /* Multiply A by 2 raised to the power N.  */
7309 float32 float32_scalbn( float32 a, int n STATUS_PARAM )
7310 {
7311     flag aSign;
7312     int16_t aExp;
7313     uint32_t aSig;
7314 
7315     a = float32_squash_input_denormal(a STATUS_VAR);
7316     aSig = extractFloat32Frac( a );
7317     aExp = extractFloat32Exp( a );
7318     aSign = extractFloat32Sign( a );
7319 
7320     if ( aExp == 0xFF ) {
7321         if ( aSig ) {
7322             return propagateFloat32NaN( a, a STATUS_VAR );
7323         }
7324         return a;
7325     }
7326     if (aExp != 0) {
7327         aSig |= 0x00800000;
7328     } else if (aSig == 0) {
7329         return a;
7330     } else {
7331         aExp++;
7332     }
7333 
7334     if (n > 0x200) {
7335         n = 0x200;
7336     } else if (n < -0x200) {
7337         n = -0x200;
7338     }
7339 
7340     aExp += n - 1;
7341     aSig <<= 7;
7342     return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
7343 }
7344 
7345 float64 float64_scalbn( float64 a, int n STATUS_PARAM )
7346 {
7347     flag aSign;
7348     int16_t aExp;
7349     uint64_t aSig;
7350 
7351     a = float64_squash_input_denormal(a STATUS_VAR);
7352     aSig = extractFloat64Frac( a );
7353     aExp = extractFloat64Exp( a );
7354     aSign = extractFloat64Sign( a );
7355 
7356     if ( aExp == 0x7FF ) {
7357         if ( aSig ) {
7358             return propagateFloat64NaN( a, a STATUS_VAR );
7359         }
7360         return a;
7361     }
7362     if (aExp != 0) {
7363         aSig |= LIT64( 0x0010000000000000 );
7364     } else if (aSig == 0) {
7365         return a;
7366     } else {
7367         aExp++;
7368     }
7369 
7370     if (n > 0x1000) {
7371         n = 0x1000;
7372     } else if (n < -0x1000) {
7373         n = -0x1000;
7374     }
7375 
7376     aExp += n - 1;
7377     aSig <<= 10;
7378     return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
7379 }
7380 
7381 floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
7382 {
7383     flag aSign;
7384     int32_t aExp;
7385     uint64_t aSig;
7386 
7387     aSig = extractFloatx80Frac( a );
7388     aExp = extractFloatx80Exp( a );
7389     aSign = extractFloatx80Sign( a );
7390 
7391     if ( aExp == 0x7FFF ) {
7392         if ( aSig<<1 ) {
7393             return propagateFloatx80NaN( a, a STATUS_VAR );
7394         }
7395         return a;
7396     }
7397 
7398     if (aExp == 0) {
7399         if (aSig == 0) {
7400             return a;
7401         }
7402         aExp++;
7403     }
7404 
7405     if (n > 0x10000) {
7406         n = 0x10000;
7407     } else if (n < -0x10000) {
7408         n = -0x10000;
7409     }
7410 
7411     aExp += n;
7412     return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
7413                                           aSign, aExp, aSig, 0 STATUS_VAR );
7414 }
7415 
7416 float128 float128_scalbn( float128 a, int n STATUS_PARAM )
7417 {
7418     flag aSign;
7419     int32_t aExp;
7420     uint64_t aSig0, aSig1;
7421 
7422     aSig1 = extractFloat128Frac1( a );
7423     aSig0 = extractFloat128Frac0( a );
7424     aExp = extractFloat128Exp( a );
7425     aSign = extractFloat128Sign( a );
7426     if ( aExp == 0x7FFF ) {
7427         if ( aSig0 | aSig1 ) {
7428             return propagateFloat128NaN( a, a STATUS_VAR );
7429         }
7430         return a;
7431     }
7432     if (aExp != 0) {
7433         aSig0 |= LIT64( 0x0001000000000000 );
7434     } else if (aSig0 == 0 && aSig1 == 0) {
7435         return a;
7436     } else {
7437         aExp++;
7438     }
7439 
7440     if (n > 0x10000) {
7441         n = 0x10000;
7442     } else if (n < -0x10000) {
7443         n = -0x10000;
7444     }
7445 
7446     aExp += n - 1;
7447     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7448                                           STATUS_VAR );
7449 
7450 }
7451