xref: /openbmc/qemu/fpu/softfloat.c (revision b8bcf811)
1 /*
2  * QEMU float support
3  *
4  * Derived from SoftFloat.
5  */
6 
7 /*============================================================================
8 
9 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
10 Package, Release 2b.
11 
12 Written by John R. Hauser.  This work was made possible in part by the
13 International Computer Science Institute, located at Suite 600, 1947 Center
14 Street, Berkeley, California 94704.  Funding was partially provided by the
15 National Science Foundation under grant MIP-9311980.  The original version
16 of this code was written as part of a project to build a fixed-point vector
17 processor in collaboration with the University of California at Berkeley,
18 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
19 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
20 arithmetic/SoftFloat.html'.
21 
22 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
23 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
24 RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
25 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
26 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
27 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
28 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
29 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
30 
31 Derivative works are acceptable, even for commercial purposes, so long as
32 (1) the source code for the derivative work includes prominent notice that
33 the work is derivative, and (2) the source code includes prominent notice with
34 these four paragraphs for those parts of this code that are retained.
35 
36 =============================================================================*/
37 
38 /* softfloat (and in particular the code in softfloat-specialize.h) is
39  * target-dependent and needs the TARGET_* macros.
40  */
41 #include "config.h"
42 
43 #include "fpu/softfloat.h"
44 
45 /* We only need stdlib for abort() */
46 #include <stdlib.h>
47 
48 /*----------------------------------------------------------------------------
49 | Primitive arithmetic functions, including multi-word arithmetic, and
50 | division and square root approximations.  (Can be specialized to target if
51 | desired.)
52 *----------------------------------------------------------------------------*/
53 #include "softfloat-macros.h"
54 
55 /*----------------------------------------------------------------------------
56 | Functions and definitions to determine:  (1) whether tininess for underflow
57 | is detected before or after rounding by default, (2) what (if anything)
58 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
59 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
60 | are propagated from function inputs to output.  These details are target-
61 | specific.
62 *----------------------------------------------------------------------------*/
63 #include "softfloat-specialize.h"
64 
65 /*----------------------------------------------------------------------------
66 | Returns the fraction bits of the half-precision floating-point value `a'.
67 *----------------------------------------------------------------------------*/
68 
69 INLINE uint32_t extractFloat16Frac(float16 a)
70 {
71     return float16_val(a) & 0x3ff;
72 }
73 
74 /*----------------------------------------------------------------------------
75 | Returns the exponent bits of the half-precision floating-point value `a'.
76 *----------------------------------------------------------------------------*/
77 
78 INLINE int_fast16_t extractFloat16Exp(float16 a)
79 {
80     return (float16_val(a) >> 10) & 0x1f;
81 }
82 
83 /*----------------------------------------------------------------------------
| Returns the sign bit of the half-precision floating-point value `a'.
85 *----------------------------------------------------------------------------*/
86 
87 INLINE flag extractFloat16Sign(float16 a)
88 {
89     return float16_val(a)>>15;
90 }
91 
92 /*----------------------------------------------------------------------------
93 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
94 | and 7, and returns the properly rounded 32-bit integer corresponding to the
95 | input.  If `zSign' is 1, the input is negated before being converted to an
96 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
97 | is simply rounded to an integer, with the inexact exception raised if the
98 | input cannot be represented exactly as an integer.  However, if the fixed-
99 | point input is too large, the invalid exception is raised and the largest
100 | positive or negative integer is returned.
101 *----------------------------------------------------------------------------*/
102 
103 static int32 roundAndPackInt32( flag zSign, uint64_t absZ STATUS_PARAM)
104 {
105     int8 roundingMode;
106     flag roundNearestEven;
107     int8 roundIncrement, roundBits;
108     int32_t z;
109 
110     roundingMode = STATUS(float_rounding_mode);
111     roundNearestEven = ( roundingMode == float_round_nearest_even );
112     switch (roundingMode) {
113     case float_round_nearest_even:
114     case float_round_ties_away:
115         roundIncrement = 0x40;
116         break;
117     case float_round_to_zero:
118         roundIncrement = 0;
119         break;
120     case float_round_up:
121         roundIncrement = zSign ? 0 : 0x7f;
122         break;
123     case float_round_down:
124         roundIncrement = zSign ? 0x7f : 0;
125         break;
126     default:
127         abort();
128     }
129     roundBits = absZ & 0x7F;
130     absZ = ( absZ + roundIncrement )>>7;
131     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
132     z = absZ;
133     if ( zSign ) z = - z;
134     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
135         float_raise( float_flag_invalid STATUS_VAR);
136         return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
137     }
138     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
139     return z;
140 
141 }
142 
143 /*----------------------------------------------------------------------------
144 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
145 | `absZ1', with binary point between bits 63 and 64 (between the input words),
146 | and returns the properly rounded 64-bit integer corresponding to the input.
147 | If `zSign' is 1, the input is negated before being converted to an integer.
148 | Ordinarily, the fixed-point input is simply rounded to an integer, with
149 | the inexact exception raised if the input cannot be represented exactly as
150 | an integer.  However, if the fixed-point input is too large, the invalid
151 | exception is raised and the largest positive or negative integer is
152 | returned.
153 *----------------------------------------------------------------------------*/
154 
155 static int64 roundAndPackInt64( flag zSign, uint64_t absZ0, uint64_t absZ1 STATUS_PARAM)
156 {
157     int8 roundingMode;
158     flag roundNearestEven, increment;
159     int64_t z;
160 
161     roundingMode = STATUS(float_rounding_mode);
162     roundNearestEven = ( roundingMode == float_round_nearest_even );
163     switch (roundingMode) {
164     case float_round_nearest_even:
165     case float_round_ties_away:
166         increment = ((int64_t) absZ1 < 0);
167         break;
168     case float_round_to_zero:
169         increment = 0;
170         break;
171     case float_round_up:
172         increment = !zSign && absZ1;
173         break;
174     case float_round_down:
175         increment = zSign && absZ1;
176         break;
177     default:
178         abort();
179     }
180     if ( increment ) {
181         ++absZ0;
182         if ( absZ0 == 0 ) goto overflow;
183         absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
184     }
185     z = absZ0;
186     if ( zSign ) z = - z;
187     if ( z && ( ( z < 0 ) ^ zSign ) ) {
188  overflow:
189         float_raise( float_flag_invalid STATUS_VAR);
190         return
191               zSign ? (int64_t) LIT64( 0x8000000000000000 )
192             : LIT64( 0x7FFFFFFFFFFFFFFF );
193     }
194     if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
195     return z;
196 
197 }
198 
199 /*----------------------------------------------------------------------------
200 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
201 | `absZ1', with binary point between bits 63 and 64 (between the input words),
202 | and returns the properly rounded 64-bit unsigned integer corresponding to the
203 | input.  Ordinarily, the fixed-point input is simply rounded to an integer,
204 | with the inexact exception raised if the input cannot be represented exactly
205 | as an integer.  However, if the fixed-point input is too large, the invalid
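| exception is raised and the largest unsigned integer is returned.  If `zSign'
| is 1 and the rounded result is nonzero, the invalid exception is raised and
| zero is returned.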
207 *----------------------------------------------------------------------------*/
208 
209 static int64 roundAndPackUint64(flag zSign, uint64_t absZ0,
210                                 uint64_t absZ1 STATUS_PARAM)
211 {
212     int8 roundingMode;
213     flag roundNearestEven, increment;
214 
215     roundingMode = STATUS(float_rounding_mode);
216     roundNearestEven = (roundingMode == float_round_nearest_even);
217     switch (roundingMode) {
218     case float_round_nearest_even:
219     case float_round_ties_away:
220         increment = ((int64_t)absZ1 < 0);
221         break;
222     case float_round_to_zero:
223         increment = 0;
224         break;
225     case float_round_up:
226         increment = !zSign && absZ1;
227         break;
228     case float_round_down:
229         increment = zSign && absZ1;
230         break;
231     default:
232         abort();
233     }
234     if (increment) {
235         ++absZ0;
236         if (absZ0 == 0) {
237             float_raise(float_flag_invalid STATUS_VAR);
238             return LIT64(0xFFFFFFFFFFFFFFFF);
239         }
240         absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
241     }
242 
243     if (zSign && absZ0) {
244         float_raise(float_flag_invalid STATUS_VAR);
245         return 0;
246     }
247 
248     if (absZ1) {
249         STATUS(float_exception_flags) |= float_flag_inexact;
250     }
251     return absZ0;
252 }
253 
254 /*----------------------------------------------------------------------------
255 | Returns the fraction bits of the single-precision floating-point value `a'.
256 *----------------------------------------------------------------------------*/
257 
258 INLINE uint32_t extractFloat32Frac( float32 a )
259 {
260 
261     return float32_val(a) & 0x007FFFFF;
262 
263 }
264 
265 /*----------------------------------------------------------------------------
266 | Returns the exponent bits of the single-precision floating-point value `a'.
267 *----------------------------------------------------------------------------*/
268 
269 INLINE int_fast16_t extractFloat32Exp(float32 a)
270 {
271 
272     return ( float32_val(a)>>23 ) & 0xFF;
273 
274 }
275 
276 /*----------------------------------------------------------------------------
277 | Returns the sign bit of the single-precision floating-point value `a'.
278 *----------------------------------------------------------------------------*/
279 
280 INLINE flag extractFloat32Sign( float32 a )
281 {
282 
283     return float32_val(a)>>31;
284 
285 }
286 
287 /*----------------------------------------------------------------------------
288 | If `a' is denormal and we are in flush-to-zero mode then set the
289 | input-denormal exception and return zero. Otherwise just return the value.
290 *----------------------------------------------------------------------------*/
291 static float32 float32_squash_input_denormal(float32 a STATUS_PARAM)
292 {
293     if (STATUS(flush_inputs_to_zero)) {
294         if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
295             float_raise(float_flag_input_denormal STATUS_VAR);
296             return make_float32(float32_val(a) & 0x80000000);
297         }
298     }
299     return a;
300 }
301 
302 /*----------------------------------------------------------------------------
303 | Normalizes the subnormal single-precision floating-point value represented
304 | by the denormalized significand `aSig'.  The normalized exponent and
305 | significand are stored at the locations pointed to by `zExpPtr' and
306 | `zSigPtr', respectively.
307 *----------------------------------------------------------------------------*/
308 
309 static void
310  normalizeFloat32Subnormal(uint32_t aSig, int_fast16_t *zExpPtr, uint32_t *zSigPtr)
311 {
312     int8 shiftCount;
313 
314     shiftCount = countLeadingZeros32( aSig ) - 8;
315     *zSigPtr = aSig<<shiftCount;
316     *zExpPtr = 1 - shiftCount;
317 
318 }
319 
320 /*----------------------------------------------------------------------------
321 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
322 | single-precision floating-point value, returning the result.  After being
323 | shifted into the proper positions, the three fields are simply added
324 | together to form the result.  This means that any integer portion of `zSig'
325 | will be added into the exponent.  Since a properly normalized significand
326 | will have an integer portion equal to 1, the `zExp' input should be 1 less
327 | than the desired result exponent whenever `zSig' is a complete, normalized
328 | significand.
329 *----------------------------------------------------------------------------*/
330 
331 INLINE float32 packFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig)
332 {
333 
334     return make_float32(
335           ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
336 
337 }
338 
339 /*----------------------------------------------------------------------------
340 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
341 | and significand `zSig', and returns the proper single-precision floating-
342 | point value corresponding to the abstract input.  Ordinarily, the abstract
343 | value is simply rounded and packed into the single-precision format, with
344 | the inexact exception raised if the abstract input cannot be represented
345 | exactly.  However, if the abstract value is too large, the overflow and
346 | inexact exceptions are raised and an infinity or maximal finite value is
347 | returned.  If the abstract value is too small, the input value is rounded to
348 | a subnormal number, and the underflow and inexact exceptions are raised if
349 | the abstract input cannot be represented exactly as a subnormal single-
350 | precision floating-point number.
351 |     The input significand `zSig' has its binary point between bits 30
352 | and 29, which is 7 bits to the left of the usual location.  This shifted
353 | significand must be normalized or smaller.  If `zSig' is not normalized,
354 | `zExp' must be 0; in that case, the result returned is a subnormal number,
355 | and it must not require rounding.  In the usual case that `zSig' is
356 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
357 | The handling of underflow and overflow follows the IEC/IEEE Standard for
358 | Binary Floating-Point Arithmetic.
359 *----------------------------------------------------------------------------*/
360 
361 static float32 roundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig STATUS_PARAM)
362 {
363     int8 roundingMode;
364     flag roundNearestEven;
365     int8 roundIncrement, roundBits;
366     flag isTiny;
367 
368     roundingMode = STATUS(float_rounding_mode);
369     roundNearestEven = ( roundingMode == float_round_nearest_even );
370     switch (roundingMode) {
371     case float_round_nearest_even:
372     case float_round_ties_away:
373         roundIncrement = 0x40;
374         break;
375     case float_round_to_zero:
376         roundIncrement = 0;
377         break;
378     case float_round_up:
379         roundIncrement = zSign ? 0 : 0x7f;
380         break;
381     case float_round_down:
382         roundIncrement = zSign ? 0x7f : 0;
383         break;
384     default:
385         abort();
386         break;
387     }
388     roundBits = zSig & 0x7F;
389     if ( 0xFD <= (uint16_t) zExp ) {
390         if (    ( 0xFD < zExp )
391              || (    ( zExp == 0xFD )
392                   && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
393            ) {
394             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
395             return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
396         }
397         if ( zExp < 0 ) {
398             if (STATUS(flush_to_zero)) {
399                 float_raise(float_flag_output_denormal STATUS_VAR);
400                 return packFloat32(zSign, 0, 0);
401             }
402             isTiny =
403                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
404                 || ( zExp < -1 )
405                 || ( zSig + roundIncrement < 0x80000000 );
406             shift32RightJamming( zSig, - zExp, &zSig );
407             zExp = 0;
408             roundBits = zSig & 0x7F;
409             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
410         }
411     }
412     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
413     zSig = ( zSig + roundIncrement )>>7;
414     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
415     if ( zSig == 0 ) zExp = 0;
416     return packFloat32( zSign, zExp, zSig );
417 
418 }
419 
420 /*----------------------------------------------------------------------------
421 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
422 | and significand `zSig', and returns the proper single-precision floating-
423 | point value corresponding to the abstract input.  This routine is just like
424 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
425 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
426 | floating-point exponent.
427 *----------------------------------------------------------------------------*/
428 
429 static float32
430  normalizeRoundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig STATUS_PARAM)
431 {
432     int8 shiftCount;
433 
434     shiftCount = countLeadingZeros32( zSig ) - 1;
435     return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
436 
437 }
438 
439 /*----------------------------------------------------------------------------
440 | Returns the fraction bits of the double-precision floating-point value `a'.
441 *----------------------------------------------------------------------------*/
442 
443 INLINE uint64_t extractFloat64Frac( float64 a )
444 {
445 
446     return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
447 
448 }
449 
450 /*----------------------------------------------------------------------------
451 | Returns the exponent bits of the double-precision floating-point value `a'.
452 *----------------------------------------------------------------------------*/
453 
454 INLINE int_fast16_t extractFloat64Exp(float64 a)
455 {
456 
457     return ( float64_val(a)>>52 ) & 0x7FF;
458 
459 }
460 
461 /*----------------------------------------------------------------------------
462 | Returns the sign bit of the double-precision floating-point value `a'.
463 *----------------------------------------------------------------------------*/
464 
465 INLINE flag extractFloat64Sign( float64 a )
466 {
467 
468     return float64_val(a)>>63;
469 
470 }
471 
472 /*----------------------------------------------------------------------------
473 | If `a' is denormal and we are in flush-to-zero mode then set the
474 | input-denormal exception and return zero. Otherwise just return the value.
475 *----------------------------------------------------------------------------*/
476 static float64 float64_squash_input_denormal(float64 a STATUS_PARAM)
477 {
478     if (STATUS(flush_inputs_to_zero)) {
479         if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
480             float_raise(float_flag_input_denormal STATUS_VAR);
481             return make_float64(float64_val(a) & (1ULL << 63));
482         }
483     }
484     return a;
485 }
486 
487 /*----------------------------------------------------------------------------
488 | Normalizes the subnormal double-precision floating-point value represented
489 | by the denormalized significand `aSig'.  The normalized exponent and
490 | significand are stored at the locations pointed to by `zExpPtr' and
491 | `zSigPtr', respectively.
492 *----------------------------------------------------------------------------*/
493 
494 static void
495  normalizeFloat64Subnormal(uint64_t aSig, int_fast16_t *zExpPtr, uint64_t *zSigPtr)
496 {
497     int8 shiftCount;
498 
499     shiftCount = countLeadingZeros64( aSig ) - 11;
500     *zSigPtr = aSig<<shiftCount;
501     *zExpPtr = 1 - shiftCount;
502 
503 }
504 
505 /*----------------------------------------------------------------------------
506 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
507 | double-precision floating-point value, returning the result.  After being
508 | shifted into the proper positions, the three fields are simply added
509 | together to form the result.  This means that any integer portion of `zSig'
510 | will be added into the exponent.  Since a properly normalized significand
511 | will have an integer portion equal to 1, the `zExp' input should be 1 less
512 | than the desired result exponent whenever `zSig' is a complete, normalized
513 | significand.
514 *----------------------------------------------------------------------------*/
515 
516 INLINE float64 packFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig)
517 {
518 
519     return make_float64(
520         ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
521 
522 }
523 
524 /*----------------------------------------------------------------------------
525 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
526 | and significand `zSig', and returns the proper double-precision floating-
527 | point value corresponding to the abstract input.  Ordinarily, the abstract
528 | value is simply rounded and packed into the double-precision format, with
529 | the inexact exception raised if the abstract input cannot be represented
530 | exactly.  However, if the abstract value is too large, the overflow and
531 | inexact exceptions are raised and an infinity or maximal finite value is
532 | returned.  If the abstract value is too small, the input value is rounded
533 | to a subnormal number, and the underflow and inexact exceptions are raised
534 | if the abstract input cannot be represented exactly as a subnormal double-
535 | precision floating-point number.
536 |     The input significand `zSig' has its binary point between bits 62
537 | and 61, which is 10 bits to the left of the usual location.  This shifted
538 | significand must be normalized or smaller.  If `zSig' is not normalized,
539 | `zExp' must be 0; in that case, the result returned is a subnormal number,
540 | and it must not require rounding.  In the usual case that `zSig' is
541 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
542 | The handling of underflow and overflow follows the IEC/IEEE Standard for
543 | Binary Floating-Point Arithmetic.
544 *----------------------------------------------------------------------------*/
545 
546 static float64 roundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig STATUS_PARAM)
547 {
548     int8 roundingMode;
549     flag roundNearestEven;
550     int_fast16_t roundIncrement, roundBits;
551     flag isTiny;
552 
553     roundingMode = STATUS(float_rounding_mode);
554     roundNearestEven = ( roundingMode == float_round_nearest_even );
555     switch (roundingMode) {
556     case float_round_nearest_even:
557     case float_round_ties_away:
558         roundIncrement = 0x200;
559         break;
560     case float_round_to_zero:
561         roundIncrement = 0;
562         break;
563     case float_round_up:
564         roundIncrement = zSign ? 0 : 0x3ff;
565         break;
566     case float_round_down:
567         roundIncrement = zSign ? 0x3ff : 0;
568         break;
569     default:
570         abort();
571     }
572     roundBits = zSig & 0x3FF;
573     if ( 0x7FD <= (uint16_t) zExp ) {
574         if (    ( 0x7FD < zExp )
575              || (    ( zExp == 0x7FD )
576                   && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
577            ) {
578             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
579             return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
580         }
581         if ( zExp < 0 ) {
582             if (STATUS(flush_to_zero)) {
583                 float_raise(float_flag_output_denormal STATUS_VAR);
584                 return packFloat64(zSign, 0, 0);
585             }
586             isTiny =
587                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
588                 || ( zExp < -1 )
589                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
590             shift64RightJamming( zSig, - zExp, &zSig );
591             zExp = 0;
592             roundBits = zSig & 0x3FF;
593             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
594         }
595     }
596     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
597     zSig = ( zSig + roundIncrement )>>10;
598     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
599     if ( zSig == 0 ) zExp = 0;
600     return packFloat64( zSign, zExp, zSig );
601 
602 }
603 
604 /*----------------------------------------------------------------------------
605 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
606 | and significand `zSig', and returns the proper double-precision floating-
607 | point value corresponding to the abstract input.  This routine is just like
608 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
609 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
610 | floating-point exponent.
611 *----------------------------------------------------------------------------*/
612 
613 static float64
614  normalizeRoundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig STATUS_PARAM)
615 {
616     int8 shiftCount;
617 
618     shiftCount = countLeadingZeros64( zSig ) - 1;
619     return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
620 
621 }
622 
623 /*----------------------------------------------------------------------------
624 | Returns the fraction bits of the extended double-precision floating-point
625 | value `a'.
626 *----------------------------------------------------------------------------*/
627 
628 INLINE uint64_t extractFloatx80Frac( floatx80 a )
629 {
630 
631     return a.low;
632 
633 }
634 
635 /*----------------------------------------------------------------------------
636 | Returns the exponent bits of the extended double-precision floating-point
637 | value `a'.
638 *----------------------------------------------------------------------------*/
639 
640 INLINE int32 extractFloatx80Exp( floatx80 a )
641 {
642 
643     return a.high & 0x7FFF;
644 
645 }
646 
647 /*----------------------------------------------------------------------------
648 | Returns the sign bit of the extended double-precision floating-point value
649 | `a'.
650 *----------------------------------------------------------------------------*/
651 
652 INLINE flag extractFloatx80Sign( floatx80 a )
653 {
654 
655     return a.high>>15;
656 
657 }
658 
659 /*----------------------------------------------------------------------------
660 | Normalizes the subnormal extended double-precision floating-point value
661 | represented by the denormalized significand `aSig'.  The normalized exponent
662 | and significand are stored at the locations pointed to by `zExpPtr' and
663 | `zSigPtr', respectively.
664 *----------------------------------------------------------------------------*/
665 
666 static void
667  normalizeFloatx80Subnormal( uint64_t aSig, int32 *zExpPtr, uint64_t *zSigPtr )
668 {
669     int8 shiftCount;
670 
671     shiftCount = countLeadingZeros64( aSig );
672     *zSigPtr = aSig<<shiftCount;
673     *zExpPtr = 1 - shiftCount;
674 
675 }
676 
677 /*----------------------------------------------------------------------------
678 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
679 | extended double-precision floating-point value, returning the result.
680 *----------------------------------------------------------------------------*/
681 
682 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, uint64_t zSig )
683 {
684     floatx80 z;
685 
686     z.low = zSig;
687     z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
688     return z;
689 
690 }
691 
692 /*----------------------------------------------------------------------------
693 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
694 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
695 | and returns the proper extended double-precision floating-point value
696 | corresponding to the abstract input.  Ordinarily, the abstract value is
697 | rounded and packed into the extended double-precision format, with the
698 | inexact exception raised if the abstract input cannot be represented
699 | exactly.  However, if the abstract value is too large, the overflow and
700 | inexact exceptions are raised and an infinity or maximal finite value is
701 | returned.  If the abstract value is too small, the input value is rounded to
702 | a subnormal number, and the underflow and inexact exceptions are raised if
703 | the abstract input cannot be represented exactly as a subnormal extended
704 | double-precision floating-point number.
705 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
706 | number of bits as single or double precision, respectively.  Otherwise, the
707 | result is rounded to the full precision of the extended double-precision
708 | format.
709 |     The input significand must be normalized or smaller.  If the input
710 | significand is not normalized, `zExp' must be 0; in that case, the result
711 | returned is a subnormal number, and it must not require rounding.  The
712 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
713 | Floating-Point Arithmetic.
714 *----------------------------------------------------------------------------*/
715 
716 static floatx80
717  roundAndPackFloatx80(
718      int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
719  STATUS_PARAM)
720 {
721     int8 roundingMode;
722     flag roundNearestEven, increment, isTiny;
723     int64 roundIncrement, roundMask, roundBits;
724 
725     roundingMode = STATUS(float_rounding_mode);
726     roundNearestEven = ( roundingMode == float_round_nearest_even );
727     if ( roundingPrecision == 80 ) goto precision80;
728     if ( roundingPrecision == 64 ) {
729         roundIncrement = LIT64( 0x0000000000000400 );
730         roundMask = LIT64( 0x00000000000007FF );
731     }
732     else if ( roundingPrecision == 32 ) {
733         roundIncrement = LIT64( 0x0000008000000000 );
734         roundMask = LIT64( 0x000000FFFFFFFFFF );
735     }
736     else {
737         goto precision80;
738     }
739     zSig0 |= ( zSig1 != 0 );
740     switch (roundingMode) {
741     case float_round_nearest_even:
742     case float_round_ties_away:
743         break;
744     case float_round_to_zero:
745         roundIncrement = 0;
746         break;
747     case float_round_up:
748         roundIncrement = zSign ? 0 : roundMask;
749         break;
750     case float_round_down:
751         roundIncrement = zSign ? roundMask : 0;
752         break;
753     default:
754         abort();
755     }
756     roundBits = zSig0 & roundMask;
757     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
758         if (    ( 0x7FFE < zExp )
759              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
760            ) {
761             goto overflow;
762         }
763         if ( zExp <= 0 ) {
764             if (STATUS(flush_to_zero)) {
765                 float_raise(float_flag_output_denormal STATUS_VAR);
766                 return packFloatx80(zSign, 0, 0);
767             }
768             isTiny =
769                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
770                 || ( zExp < 0 )
771                 || ( zSig0 <= zSig0 + roundIncrement );
772             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
773             zExp = 0;
774             roundBits = zSig0 & roundMask;
775             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
776             if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
777             zSig0 += roundIncrement;
778             if ( (int64_t) zSig0 < 0 ) zExp = 1;
779             roundIncrement = roundMask + 1;
780             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
781                 roundMask |= roundIncrement;
782             }
783             zSig0 &= ~ roundMask;
784             return packFloatx80( zSign, zExp, zSig0 );
785         }
786     }
787     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
788     zSig0 += roundIncrement;
789     if ( zSig0 < roundIncrement ) {
790         ++zExp;
791         zSig0 = LIT64( 0x8000000000000000 );
792     }
793     roundIncrement = roundMask + 1;
794     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
795         roundMask |= roundIncrement;
796     }
797     zSig0 &= ~ roundMask;
798     if ( zSig0 == 0 ) zExp = 0;
799     return packFloatx80( zSign, zExp, zSig0 );
800  precision80:
801     switch (roundingMode) {
802     case float_round_nearest_even:
803     case float_round_ties_away:
804         increment = ((int64_t)zSig1 < 0);
805         break;
806     case float_round_to_zero:
807         increment = 0;
808         break;
809     case float_round_up:
810         increment = !zSign && zSig1;
811         break;
812     case float_round_down:
813         increment = zSign && zSig1;
814         break;
815     default:
816         abort();
817     }
818     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
819         if (    ( 0x7FFE < zExp )
820              || (    ( zExp == 0x7FFE )
821                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
822                   && increment
823                 )
824            ) {
825             roundMask = 0;
826  overflow:
827             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
828             if (    ( roundingMode == float_round_to_zero )
829                  || ( zSign && ( roundingMode == float_round_up ) )
830                  || ( ! zSign && ( roundingMode == float_round_down ) )
831                ) {
832                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
833             }
834             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
835         }
836         if ( zExp <= 0 ) {
837             isTiny =
838                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
839                 || ( zExp < 0 )
840                 || ! increment
841                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
842             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
843             zExp = 0;
844             if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
845             if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
846             switch (roundingMode) {
847             case float_round_nearest_even:
848             case float_round_ties_away:
849                 increment = ((int64_t)zSig1 < 0);
850                 break;
851             case float_round_to_zero:
852                 increment = 0;
853                 break;
854             case float_round_up:
855                 increment = !zSign && zSig1;
856                 break;
857             case float_round_down:
858                 increment = zSign && zSig1;
859                 break;
860             default:
861                 abort();
862             }
863             if ( increment ) {
864                 ++zSig0;
865                 zSig0 &=
866                     ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
867                 if ( (int64_t) zSig0 < 0 ) zExp = 1;
868             }
869             return packFloatx80( zSign, zExp, zSig0 );
870         }
871     }
872     if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
873     if ( increment ) {
874         ++zSig0;
875         if ( zSig0 == 0 ) {
876             ++zExp;
877             zSig0 = LIT64( 0x8000000000000000 );
878         }
879         else {
880             zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
881         }
882     }
883     else {
884         if ( zSig0 == 0 ) zExp = 0;
885     }
886     return packFloatx80( zSign, zExp, zSig0 );
887 
888 }
889 
890 /*----------------------------------------------------------------------------
891 | Takes an abstract floating-point value having sign `zSign', exponent
892 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
893 | and returns the proper extended double-precision floating-point value
894 | corresponding to the abstract input.  This routine is just like
895 | `roundAndPackFloatx80' except that the input significand does not have to be
896 | normalized.
897 *----------------------------------------------------------------------------*/
898 
899 static floatx80
900  normalizeRoundAndPackFloatx80(
901      int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
902  STATUS_PARAM)
903 {
904     int8 shiftCount;
905 
906     if ( zSig0 == 0 ) {
907         zSig0 = zSig1;
908         zSig1 = 0;
909         zExp -= 64;
910     }
911     shiftCount = countLeadingZeros64( zSig0 );
912     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
913     zExp -= shiftCount;
914     return
915         roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
916 
917 }
918 
919 /*----------------------------------------------------------------------------
920 | Returns the least-significant 64 fraction bits of the quadruple-precision
921 | floating-point value `a'.
922 *----------------------------------------------------------------------------*/
923 
924 INLINE uint64_t extractFloat128Frac1( float128 a )
925 {
926 
927     return a.low;
928 
929 }
930 
931 /*----------------------------------------------------------------------------
932 | Returns the most-significant 48 fraction bits of the quadruple-precision
933 | floating-point value `a'.
934 *----------------------------------------------------------------------------*/
935 
936 INLINE uint64_t extractFloat128Frac0( float128 a )
937 {
938 
939     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
940 
941 }
942 
943 /*----------------------------------------------------------------------------
944 | Returns the exponent bits of the quadruple-precision floating-point value
945 | `a'.
946 *----------------------------------------------------------------------------*/
947 
948 INLINE int32 extractFloat128Exp( float128 a )
949 {
950 
951     return ( a.high>>48 ) & 0x7FFF;
952 
953 }
954 
955 /*----------------------------------------------------------------------------
956 | Returns the sign bit of the quadruple-precision floating-point value `a'.
957 *----------------------------------------------------------------------------*/
958 
959 INLINE flag extractFloat128Sign( float128 a )
960 {
961 
962     return a.high>>63;
963 
964 }
965 
966 /*----------------------------------------------------------------------------
967 | Normalizes the subnormal quadruple-precision floating-point value
968 | represented by the denormalized significand formed by the concatenation of
969 | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
970 | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
971 | significand are stored at the location pointed to by `zSig0Ptr', and the
972 | least significant 64 bits of the normalized significand are stored at the
973 | location pointed to by `zSig1Ptr'.
974 *----------------------------------------------------------------------------*/
975 
976 static void
977  normalizeFloat128Subnormal(
978      uint64_t aSig0,
979      uint64_t aSig1,
980      int32 *zExpPtr,
981      uint64_t *zSig0Ptr,
982      uint64_t *zSig1Ptr
983  )
984 {
985     int8 shiftCount;
986 
987     if ( aSig0 == 0 ) {
988         shiftCount = countLeadingZeros64( aSig1 ) - 15;
989         if ( shiftCount < 0 ) {
990             *zSig0Ptr = aSig1>>( - shiftCount );
991             *zSig1Ptr = aSig1<<( shiftCount & 63 );
992         }
993         else {
994             *zSig0Ptr = aSig1<<shiftCount;
995             *zSig1Ptr = 0;
996         }
997         *zExpPtr = - shiftCount - 63;
998     }
999     else {
1000         shiftCount = countLeadingZeros64( aSig0 ) - 15;
1001         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
1002         *zExpPtr = 1 - shiftCount;
1003     }
1004 
1005 }
1006 
1007 /*----------------------------------------------------------------------------
1008 | Packs the sign `zSign', the exponent `zExp', and the significand formed
1009 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
1010 | floating-point value, returning the result.  After being shifted into the
1011 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
1012 | added together to form the most significant 32 bits of the result.  This
1013 | means that any integer portion of `zSig0' will be added into the exponent.
1014 | Since a properly normalized significand will have an integer portion equal
1015 | to 1, the `zExp' input should be 1 less than the desired result exponent
1016 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
1017 | significand.
1018 *----------------------------------------------------------------------------*/
1019 
1020 INLINE float128
1021  packFloat128( flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 )
1022 {
1023     float128 z;
1024 
1025     z.low = zSig1;
1026     z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
1027     return z;
1028 
1029 }
1030 
1031 /*----------------------------------------------------------------------------
1032 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1033 | and extended significand formed by the concatenation of `zSig0', `zSig1',
1034 | and `zSig2', and returns the proper quadruple-precision floating-point value
1035 | corresponding to the abstract input.  Ordinarily, the abstract value is
1036 | simply rounded and packed into the quadruple-precision format, with the
1037 | inexact exception raised if the abstract input cannot be represented
1038 | exactly.  However, if the abstract value is too large, the overflow and
1039 | inexact exceptions are raised and an infinity or maximal finite value is
1040 | returned.  If the abstract value is too small, the input value is rounded to
1041 | a subnormal number, and the underflow and inexact exceptions are raised if
1042 | the abstract input cannot be represented exactly as a subnormal quadruple-
1043 | precision floating-point number.
1044 |     The input significand must be normalized or smaller.  If the input
1045 | significand is not normalized, `zExp' must be 0; in that case, the result
1046 | returned is a subnormal number, and it must not require rounding.  In the
1047 | usual case that the input significand is normalized, `zExp' must be 1 less
1048 | than the ``true'' floating-point exponent.  The handling of underflow and
1049 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1050 *----------------------------------------------------------------------------*/
1051 
1052 static float128
1053  roundAndPackFloat128(
1054      flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1, uint64_t zSig2 STATUS_PARAM)
1055 {
1056     int8 roundingMode;
1057     flag roundNearestEven, increment, isTiny;
1058 
1059     roundingMode = STATUS(float_rounding_mode);
1060     roundNearestEven = ( roundingMode == float_round_nearest_even );
1061     switch (roundingMode) {
1062     case float_round_nearest_even:
1063     case float_round_ties_away:
1064         increment = ((int64_t)zSig2 < 0);
1065         break;
1066     case float_round_to_zero:
1067         increment = 0;
1068         break;
1069     case float_round_up:
1070         increment = !zSign && zSig2;
1071         break;
1072     case float_round_down:
1073         increment = zSign && zSig2;
1074         break;
1075     default:
1076         abort();
1077     }
1078     if ( 0x7FFD <= (uint32_t) zExp ) {
1079         if (    ( 0x7FFD < zExp )
1080              || (    ( zExp == 0x7FFD )
1081                   && eq128(
1082                          LIT64( 0x0001FFFFFFFFFFFF ),
1083                          LIT64( 0xFFFFFFFFFFFFFFFF ),
1084                          zSig0,
1085                          zSig1
1086                      )
1087                   && increment
1088                 )
1089            ) {
1090             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
1091             if (    ( roundingMode == float_round_to_zero )
1092                  || ( zSign && ( roundingMode == float_round_up ) )
1093                  || ( ! zSign && ( roundingMode == float_round_down ) )
1094                ) {
1095                 return
1096                     packFloat128(
1097                         zSign,
1098                         0x7FFE,
1099                         LIT64( 0x0000FFFFFFFFFFFF ),
1100                         LIT64( 0xFFFFFFFFFFFFFFFF )
1101                     );
1102             }
1103             return packFloat128( zSign, 0x7FFF, 0, 0 );
1104         }
1105         if ( zExp < 0 ) {
1106             if (STATUS(flush_to_zero)) {
1107                 float_raise(float_flag_output_denormal STATUS_VAR);
1108                 return packFloat128(zSign, 0, 0, 0);
1109             }
1110             isTiny =
1111                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
1112                 || ( zExp < -1 )
1113                 || ! increment
1114                 || lt128(
1115                        zSig0,
1116                        zSig1,
1117                        LIT64( 0x0001FFFFFFFFFFFF ),
1118                        LIT64( 0xFFFFFFFFFFFFFFFF )
1119                    );
1120             shift128ExtraRightJamming(
1121                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1122             zExp = 0;
1123             if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
1124             switch (roundingMode) {
1125             case float_round_nearest_even:
1126             case float_round_ties_away:
1127                 increment = ((int64_t)zSig2 < 0);
1128                 break;
1129             case float_round_to_zero:
1130                 increment = 0;
1131                 break;
1132             case float_round_up:
1133                 increment = !zSign && zSig2;
1134                 break;
1135             case float_round_down:
1136                 increment = zSign && zSig2;
1137                 break;
1138             default:
1139                 abort();
1140             }
1141         }
1142     }
1143     if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
1144     if ( increment ) {
1145         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1146         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1147     }
1148     else {
1149         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1150     }
1151     return packFloat128( zSign, zExp, zSig0, zSig1 );
1152 
1153 }
1154 
1155 /*----------------------------------------------------------------------------
1156 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1157 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1158 | returns the proper quadruple-precision floating-point value corresponding
1159 | to the abstract input.  This routine is just like `roundAndPackFloat128'
1160 | except that the input significand has fewer bits and does not have to be
1161 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1162 | point exponent.
1163 *----------------------------------------------------------------------------*/
1164 
1165 static float128
1166  normalizeRoundAndPackFloat128(
1167      flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 STATUS_PARAM)
1168 {
1169     int8 shiftCount;
1170     uint64_t zSig2;
1171 
1172     if ( zSig0 == 0 ) {
1173         zSig0 = zSig1;
1174         zSig1 = 0;
1175         zExp -= 64;
1176     }
1177     shiftCount = countLeadingZeros64( zSig0 ) - 15;
1178     if ( 0 <= shiftCount ) {
1179         zSig2 = 0;
1180         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1181     }
1182     else {
1183         shift128ExtraRightJamming(
1184             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1185     }
1186     zExp -= shiftCount;
1187     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1188 
1189 }
1190 
1191 /*----------------------------------------------------------------------------
1192 | Returns the result of converting the 32-bit two's complement integer `a'
1193 | to the single-precision floating-point format.  The conversion is performed
1194 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1195 *----------------------------------------------------------------------------*/
1196 
1197 float32 int32_to_float32(int32_t a STATUS_PARAM)
1198 {
1199     flag zSign;
1200 
1201     if ( a == 0 ) return float32_zero;
1202     if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1203     zSign = ( a < 0 );
1204     return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1205 
1206 }
1207 
1208 /*----------------------------------------------------------------------------
1209 | Returns the result of converting the 32-bit two's complement integer `a'
1210 | to the double-precision floating-point format.  The conversion is performed
1211 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1212 *----------------------------------------------------------------------------*/
1213 
1214 float64 int32_to_float64(int32_t a STATUS_PARAM)
1215 {
1216     flag zSign;
1217     uint32 absA;
1218     int8 shiftCount;
1219     uint64_t zSig;
1220 
1221     if ( a == 0 ) return float64_zero;
1222     zSign = ( a < 0 );
1223     absA = zSign ? - a : a;
1224     shiftCount = countLeadingZeros32( absA ) + 21;
1225     zSig = absA;
1226     return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1227 
1228 }
1229 
1230 /*----------------------------------------------------------------------------
1231 | Returns the result of converting the 32-bit two's complement integer `a'
1232 | to the extended double-precision floating-point format.  The conversion
1233 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1234 | Arithmetic.
1235 *----------------------------------------------------------------------------*/
1236 
1237 floatx80 int32_to_floatx80(int32_t a STATUS_PARAM)
1238 {
1239     flag zSign;
1240     uint32 absA;
1241     int8 shiftCount;
1242     uint64_t zSig;
1243 
1244     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1245     zSign = ( a < 0 );
1246     absA = zSign ? - a : a;
1247     shiftCount = countLeadingZeros32( absA ) + 32;
1248     zSig = absA;
1249     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1250 
1251 }
1252 
1253 /*----------------------------------------------------------------------------
1254 | Returns the result of converting the 32-bit two's complement integer `a' to
1255 | the quadruple-precision floating-point format.  The conversion is performed
1256 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1257 *----------------------------------------------------------------------------*/
1258 
1259 float128 int32_to_float128(int32_t a STATUS_PARAM)
1260 {
1261     flag zSign;
1262     uint32 absA;
1263     int8 shiftCount;
1264     uint64_t zSig0;
1265 
1266     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1267     zSign = ( a < 0 );
1268     absA = zSign ? - a : a;
1269     shiftCount = countLeadingZeros32( absA ) + 17;
1270     zSig0 = absA;
1271     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1272 
1273 }
1274 
1275 /*----------------------------------------------------------------------------
1276 | Returns the result of converting the 64-bit two's complement integer `a'
1277 | to the single-precision floating-point format.  The conversion is performed
1278 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1279 *----------------------------------------------------------------------------*/
1280 
1281 float32 int64_to_float32(int64_t a STATUS_PARAM)
1282 {
1283     flag zSign;
1284     uint64 absA;
1285     int8 shiftCount;
1286 
1287     if ( a == 0 ) return float32_zero;
1288     zSign = ( a < 0 );
1289     absA = zSign ? - a : a;
1290     shiftCount = countLeadingZeros64( absA ) - 40;
1291     if ( 0 <= shiftCount ) {
1292         return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1293     }
1294     else {
1295         shiftCount += 7;
1296         if ( shiftCount < 0 ) {
1297             shift64RightJamming( absA, - shiftCount, &absA );
1298         }
1299         else {
1300             absA <<= shiftCount;
1301         }
1302         return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1303     }
1304 
1305 }
1306 
1307 float32 uint64_to_float32(uint64_t a STATUS_PARAM)
1308 {
1309     int8 shiftCount;
1310 
1311     if ( a == 0 ) return float32_zero;
1312     shiftCount = countLeadingZeros64( a ) - 40;
1313     if ( 0 <= shiftCount ) {
1314         return packFloat32(0, 0x95 - shiftCount, a<<shiftCount);
1315     }
1316     else {
1317         shiftCount += 7;
1318         if ( shiftCount < 0 ) {
1319             shift64RightJamming( a, - shiftCount, &a );
1320         }
1321         else {
1322             a <<= shiftCount;
1323         }
1324         return roundAndPackFloat32(0, 0x9C - shiftCount, a STATUS_VAR);
1325     }
1326 }
1327 
1328 /*----------------------------------------------------------------------------
1329 | Returns the result of converting the 64-bit two's complement integer `a'
1330 | to the double-precision floating-point format.  The conversion is performed
1331 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1332 *----------------------------------------------------------------------------*/
1333 
1334 float64 int64_to_float64(int64_t a STATUS_PARAM)
1335 {
1336     flag zSign;
1337 
1338     if ( a == 0 ) return float64_zero;
1339     if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
1340         return packFloat64( 1, 0x43E, 0 );
1341     }
1342     zSign = ( a < 0 );
1343     return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1344 
1345 }
1346 
1347 float64 uint64_to_float64(uint64_t a STATUS_PARAM)
1348 {
1349     int exp =  0x43C;
1350 
1351     if (a == 0) {
1352         return float64_zero;
1353     }
1354     if ((int64_t)a < 0) {
1355         shift64RightJamming(a, 1, &a);
1356         exp += 1;
1357     }
1358     return normalizeRoundAndPackFloat64(0, exp, a STATUS_VAR);
1359 }
1360 
1361 /*----------------------------------------------------------------------------
1362 | Returns the result of converting the 64-bit two's complement integer `a'
1363 | to the extended double-precision floating-point format.  The conversion
1364 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1365 | Arithmetic.
1366 *----------------------------------------------------------------------------*/
1367 
1368 floatx80 int64_to_floatx80(int64_t a STATUS_PARAM)
1369 {
1370     flag zSign;
1371     uint64 absA;
1372     int8 shiftCount;
1373 
1374     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1375     zSign = ( a < 0 );
1376     absA = zSign ? - a : a;
1377     shiftCount = countLeadingZeros64( absA );
1378     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1379 
1380 }
1381 
1382 /*----------------------------------------------------------------------------
1383 | Returns the result of converting the 64-bit two's complement integer `a' to
1384 | the quadruple-precision floating-point format.  The conversion is performed
1385 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1386 *----------------------------------------------------------------------------*/
1387 
1388 float128 int64_to_float128(int64_t a STATUS_PARAM)
1389 {
1390     flag zSign;
1391     uint64 absA;
1392     int8 shiftCount;
1393     int32 zExp;
1394     uint64_t zSig0, zSig1;
1395 
1396     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1397     zSign = ( a < 0 );
1398     absA = zSign ? - a : a;
1399     shiftCount = countLeadingZeros64( absA ) + 49;
1400     zExp = 0x406E - shiftCount;
1401     if ( 64 <= shiftCount ) {
1402         zSig1 = 0;
1403         zSig0 = absA;
1404         shiftCount -= 64;
1405     }
1406     else {
1407         zSig1 = absA;
1408         zSig0 = 0;
1409     }
1410     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1411     return packFloat128( zSign, zExp, zSig0, zSig1 );
1412 
1413 }
1414 
1415 float128 uint64_to_float128(uint64_t a STATUS_PARAM)
1416 {
1417     if (a == 0) {
1418         return float128_zero;
1419     }
1420     return normalizeRoundAndPackFloat128(0, 0x406E, a, 0 STATUS_VAR);
1421 }
1422 
1423 /*----------------------------------------------------------------------------
1424 | Returns the result of converting the single-precision floating-point value
1425 | `a' to the 32-bit two's complement integer format.  The conversion is
1426 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1427 | Arithmetic---which means in particular that the conversion is rounded
1428 | according to the current rounding mode.  If `a' is a NaN, the largest
1429 | positive integer is returned.  Otherwise, if the conversion overflows, the
1430 | largest integer with the same sign as `a' is returned.
1431 *----------------------------------------------------------------------------*/
1432 
1433 int32 float32_to_int32( float32 a STATUS_PARAM )
1434 {
1435     flag aSign;
1436     int_fast16_t aExp, shiftCount;
1437     uint32_t aSig;
1438     uint64_t aSig64;
1439 
1440     a = float32_squash_input_denormal(a STATUS_VAR);
1441     aSig = extractFloat32Frac( a );
1442     aExp = extractFloat32Exp( a );
1443     aSign = extractFloat32Sign( a );
1444     if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1445     if ( aExp ) aSig |= 0x00800000;
1446     shiftCount = 0xAF - aExp;
1447     aSig64 = aSig;
1448     aSig64 <<= 32;
1449     if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1450     return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1451 
1452 }
1453 
1454 /*----------------------------------------------------------------------------
1455 | Returns the result of converting the single-precision floating-point value
1456 | `a' to the 32-bit two's complement integer format.  The conversion is
1457 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1458 | Arithmetic, except that the conversion is always rounded toward zero.
1459 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1460 | the conversion overflows, the largest integer with the same sign as `a' is
1461 | returned.
1462 *----------------------------------------------------------------------------*/
1463 
1464 int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1465 {
1466     flag aSign;
1467     int_fast16_t aExp, shiftCount;
1468     uint32_t aSig;
1469     int32_t z;
1470     a = float32_squash_input_denormal(a STATUS_VAR);
1471 
1472     aSig = extractFloat32Frac( a );
1473     aExp = extractFloat32Exp( a );
1474     aSign = extractFloat32Sign( a );
1475     shiftCount = aExp - 0x9E;
1476     if ( 0 <= shiftCount ) {
1477         if ( float32_val(a) != 0xCF000000 ) {
1478             float_raise( float_flag_invalid STATUS_VAR);
1479             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1480         }
1481         return (int32_t) 0x80000000;
1482     }
1483     else if ( aExp <= 0x7E ) {
1484         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1485         return 0;
1486     }
1487     aSig = ( aSig | 0x00800000 )<<8;
1488     z = aSig>>( - shiftCount );
1489     if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1490         STATUS(float_exception_flags) |= float_flag_inexact;
1491     }
1492     if ( aSign ) z = - z;
1493     return z;
1494 
1495 }
1496 
1497 /*----------------------------------------------------------------------------
1498 | Returns the result of converting the single-precision floating-point value
1499 | `a' to the 16-bit two's complement integer format.  The conversion is
1500 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1501 | Arithmetic, except that the conversion is always rounded toward zero.
1502 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1503 | the conversion overflows, the largest integer with the same sign as `a' is
1504 | returned.
1505 *----------------------------------------------------------------------------*/
1506 
1507 int_fast16_t float32_to_int16_round_to_zero(float32 a STATUS_PARAM)
1508 {
1509     flag aSign;
1510     int_fast16_t aExp, shiftCount;
1511     uint32_t aSig;
1512     int32 z;
1513 
1514     aSig = extractFloat32Frac( a );
1515     aExp = extractFloat32Exp( a );
1516     aSign = extractFloat32Sign( a );
1517     shiftCount = aExp - 0x8E;
1518     if ( 0 <= shiftCount ) {
1519         if ( float32_val(a) != 0xC7000000 ) {
1520             float_raise( float_flag_invalid STATUS_VAR);
1521             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1522                 return 0x7FFF;
1523             }
1524         }
1525         return (int32_t) 0xffff8000;
1526     }
1527     else if ( aExp <= 0x7E ) {
1528         if ( aExp | aSig ) {
1529             STATUS(float_exception_flags) |= float_flag_inexact;
1530         }
1531         return 0;
1532     }
1533     shiftCount -= 0x10;
1534     aSig = ( aSig | 0x00800000 )<<8;
1535     z = aSig>>( - shiftCount );
1536     if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1537         STATUS(float_exception_flags) |= float_flag_inexact;
1538     }
1539     if ( aSign ) {
1540         z = - z;
1541     }
1542     return z;
1543 
1544 }
1545 
1546 /*----------------------------------------------------------------------------
1547 | Returns the result of converting the single-precision floating-point value
1548 | `a' to the 64-bit two's complement integer format.  The conversion is
1549 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1550 | Arithmetic---which means in particular that the conversion is rounded
1551 | according to the current rounding mode.  If `a' is a NaN, the largest
1552 | positive integer is returned.  Otherwise, if the conversion overflows, the
1553 | largest integer with the same sign as `a' is returned.
1554 *----------------------------------------------------------------------------*/
1555 
1556 int64 float32_to_int64( float32 a STATUS_PARAM )
1557 {
1558     flag aSign;
1559     int_fast16_t aExp, shiftCount;
1560     uint32_t aSig;
1561     uint64_t aSig64, aSigExtra;
1562     a = float32_squash_input_denormal(a STATUS_VAR);
1563 
1564     aSig = extractFloat32Frac( a );
1565     aExp = extractFloat32Exp( a );
1566     aSign = extractFloat32Sign( a );
1567     shiftCount = 0xBE - aExp;
1568     if ( shiftCount < 0 ) {
1569         float_raise( float_flag_invalid STATUS_VAR);
1570         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1571             return LIT64( 0x7FFFFFFFFFFFFFFF );
1572         }
1573         return (int64_t) LIT64( 0x8000000000000000 );
1574     }
1575     if ( aExp ) aSig |= 0x00800000;
1576     aSig64 = aSig;
1577     aSig64 <<= 40;
1578     shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1579     return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1580 
1581 }
1582 
1583 /*----------------------------------------------------------------------------
1584 | Returns the result of converting the single-precision floating-point value
1585 | `a' to the 64-bit unsigned integer format.  The conversion is
1586 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1587 | Arithmetic---which means in particular that the conversion is rounded
1588 | according to the current rounding mode.  If `a' is a NaN, the largest
1589 | unsigned integer is returned.  Otherwise, if the conversion overflows, the
1590 | largest unsigned integer is returned.  If the 'a' is negative, the result
1591 | is rounded and zero is returned; values that do not round to zero will
1592 | raise the inexact exception flag.
1593 *----------------------------------------------------------------------------*/
1594 
1595 uint64 float32_to_uint64(float32 a STATUS_PARAM)
1596 {
1597     flag aSign;
1598     int_fast16_t aExp, shiftCount;
1599     uint32_t aSig;
1600     uint64_t aSig64, aSigExtra;
1601     a = float32_squash_input_denormal(a STATUS_VAR);
1602 
1603     aSig = extractFloat32Frac(a);
1604     aExp = extractFloat32Exp(a);
1605     aSign = extractFloat32Sign(a);
1606     if ((aSign) && (aExp > 126)) {
1607         float_raise(float_flag_invalid STATUS_VAR);
1608         if (float32_is_any_nan(a)) {
1609             return LIT64(0xFFFFFFFFFFFFFFFF);
1610         } else {
1611             return 0;
1612         }
1613     }
1614     shiftCount = 0xBE - aExp;
1615     if (aExp) {
1616         aSig |= 0x00800000;
1617     }
1618     if (shiftCount < 0) {
1619         float_raise(float_flag_invalid STATUS_VAR);
1620         return LIT64(0xFFFFFFFFFFFFFFFF);
1621     }
1622 
1623     aSig64 = aSig;
1624     aSig64 <<= 40;
1625     shift64ExtraRightJamming(aSig64, 0, shiftCount, &aSig64, &aSigExtra);
1626     return roundAndPackUint64(aSign, aSig64, aSigExtra STATUS_VAR);
1627 }
1628 
1629 /*----------------------------------------------------------------------------
1630 | Returns the result of converting the single-precision floating-point value
1631 | `a' to the 64-bit two's complement integer format.  The conversion is
1632 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1633 | Arithmetic, except that the conversion is always rounded toward zero.  If
1634 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1635 | conversion overflows, the largest integer with the same sign as `a' is
1636 | returned.
1637 *----------------------------------------------------------------------------*/
1638 
1639 int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1640 {
1641     flag aSign;
1642     int_fast16_t aExp, shiftCount;
1643     uint32_t aSig;
1644     uint64_t aSig64;
1645     int64 z;
1646     a = float32_squash_input_denormal(a STATUS_VAR);
1647 
1648     aSig = extractFloat32Frac( a );
1649     aExp = extractFloat32Exp( a );
1650     aSign = extractFloat32Sign( a );
1651     shiftCount = aExp - 0xBE;
1652     if ( 0 <= shiftCount ) {
1653         if ( float32_val(a) != 0xDF000000 ) {
1654             float_raise( float_flag_invalid STATUS_VAR);
1655             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1656                 return LIT64( 0x7FFFFFFFFFFFFFFF );
1657             }
1658         }
1659         return (int64_t) LIT64( 0x8000000000000000 );
1660     }
1661     else if ( aExp <= 0x7E ) {
1662         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1663         return 0;
1664     }
1665     aSig64 = aSig | 0x00800000;
1666     aSig64 <<= 40;
1667     z = aSig64>>( - shiftCount );
1668     if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
1669         STATUS(float_exception_flags) |= float_flag_inexact;
1670     }
1671     if ( aSign ) z = - z;
1672     return z;
1673 
1674 }
1675 
1676 /*----------------------------------------------------------------------------
1677 | Returns the result of converting the single-precision floating-point value
1678 | `a' to the double-precision floating-point format.  The conversion is
1679 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1680 | Arithmetic.
1681 *----------------------------------------------------------------------------*/
1682 
1683 float64 float32_to_float64( float32 a STATUS_PARAM )
1684 {
1685     flag aSign;
1686     int_fast16_t aExp;
1687     uint32_t aSig;
1688     a = float32_squash_input_denormal(a STATUS_VAR);
1689 
1690     aSig = extractFloat32Frac( a );
1691     aExp = extractFloat32Exp( a );
1692     aSign = extractFloat32Sign( a );
1693     if ( aExp == 0xFF ) {
1694         if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1695         return packFloat64( aSign, 0x7FF, 0 );
1696     }
1697     if ( aExp == 0 ) {
1698         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1699         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1700         --aExp;
1701     }
1702     return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
1703 
1704 }
1705 
1706 /*----------------------------------------------------------------------------
1707 | Returns the result of converting the single-precision floating-point value
1708 | `a' to the extended double-precision floating-point format.  The conversion
1709 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1710 | Arithmetic.
1711 *----------------------------------------------------------------------------*/
1712 
1713 floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1714 {
1715     flag aSign;
1716     int_fast16_t aExp;
1717     uint32_t aSig;
1718 
1719     a = float32_squash_input_denormal(a STATUS_VAR);
1720     aSig = extractFloat32Frac( a );
1721     aExp = extractFloat32Exp( a );
1722     aSign = extractFloat32Sign( a );
1723     if ( aExp == 0xFF ) {
1724         if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1725         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1726     }
1727     if ( aExp == 0 ) {
1728         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1729         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1730     }
1731     aSig |= 0x00800000;
1732     return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
1733 
1734 }
1735 
1736 /*----------------------------------------------------------------------------
1737 | Returns the result of converting the single-precision floating-point value
1738 | `a' to the double-precision floating-point format.  The conversion is
1739 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1740 | Arithmetic.
1741 *----------------------------------------------------------------------------*/
1742 
1743 float128 float32_to_float128( float32 a STATUS_PARAM )
1744 {
1745     flag aSign;
1746     int_fast16_t aExp;
1747     uint32_t aSig;
1748 
1749     a = float32_squash_input_denormal(a STATUS_VAR);
1750     aSig = extractFloat32Frac( a );
1751     aExp = extractFloat32Exp( a );
1752     aSign = extractFloat32Sign( a );
1753     if ( aExp == 0xFF ) {
1754         if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1755         return packFloat128( aSign, 0x7FFF, 0, 0 );
1756     }
1757     if ( aExp == 0 ) {
1758         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1759         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1760         --aExp;
1761     }
1762     return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
1763 
1764 }
1765 
1766 /*----------------------------------------------------------------------------
1767 | Rounds the single-precision floating-point value `a' to an integer, and
1768 | returns the result as a single-precision floating-point value.  The
1769 | operation is performed according to the IEC/IEEE Standard for Binary
1770 | Floating-Point Arithmetic.
1771 *----------------------------------------------------------------------------*/
1772 
1773 float32 float32_round_to_int( float32 a STATUS_PARAM)
1774 {
1775     flag aSign;
1776     int_fast16_t aExp;
1777     uint32_t lastBitMask, roundBitsMask;
1778     uint32_t z;
1779     a = float32_squash_input_denormal(a STATUS_VAR);
1780 
1781     aExp = extractFloat32Exp( a );
1782     if ( 0x96 <= aExp ) {
1783         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1784             return propagateFloat32NaN( a, a STATUS_VAR );
1785         }
1786         return a;
1787     }
1788     if ( aExp <= 0x7E ) {
1789         if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
1790         STATUS(float_exception_flags) |= float_flag_inexact;
1791         aSign = extractFloat32Sign( a );
1792         switch ( STATUS(float_rounding_mode) ) {
1793          case float_round_nearest_even:
1794             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1795                 return packFloat32( aSign, 0x7F, 0 );
1796             }
1797             break;
1798         case float_round_ties_away:
1799             if (aExp == 0x7E) {
1800                 return packFloat32(aSign, 0x7F, 0);
1801             }
1802             break;
1803          case float_round_down:
1804             return make_float32(aSign ? 0xBF800000 : 0);
1805          case float_round_up:
1806             return make_float32(aSign ? 0x80000000 : 0x3F800000);
1807         }
1808         return packFloat32( aSign, 0, 0 );
1809     }
1810     lastBitMask = 1;
1811     lastBitMask <<= 0x96 - aExp;
1812     roundBitsMask = lastBitMask - 1;
1813     z = float32_val(a);
1814     switch (STATUS(float_rounding_mode)) {
1815     case float_round_nearest_even:
1816         z += lastBitMask>>1;
1817         if ((z & roundBitsMask) == 0) {
1818             z &= ~lastBitMask;
1819         }
1820         break;
1821     case float_round_ties_away:
1822         z += lastBitMask >> 1;
1823         break;
1824     case float_round_to_zero:
1825         break;
1826     case float_round_up:
1827         if (!extractFloat32Sign(make_float32(z))) {
1828             z += roundBitsMask;
1829         }
1830         break;
1831     case float_round_down:
1832         if (extractFloat32Sign(make_float32(z))) {
1833             z += roundBitsMask;
1834         }
1835         break;
1836     default:
1837         abort();
1838     }
1839     z &= ~ roundBitsMask;
1840     if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
1841     return make_float32(z);
1842 
1843 }
1844 
1845 /*----------------------------------------------------------------------------
1846 | Returns the result of adding the absolute values of the single-precision
1847 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1848 | before being returned.  `zSign' is ignored if the result is a NaN.
1849 | The addition is performed according to the IEC/IEEE Standard for Binary
1850 | Floating-Point Arithmetic.
1851 *----------------------------------------------------------------------------*/
1852 
1853 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1854 {
1855     int_fast16_t aExp, bExp, zExp;
1856     uint32_t aSig, bSig, zSig;
1857     int_fast16_t expDiff;
1858 
1859     aSig = extractFloat32Frac( a );
1860     aExp = extractFloat32Exp( a );
1861     bSig = extractFloat32Frac( b );
1862     bExp = extractFloat32Exp( b );
1863     expDiff = aExp - bExp;
1864     aSig <<= 6;
1865     bSig <<= 6;
1866     if ( 0 < expDiff ) {
1867         if ( aExp == 0xFF ) {
1868             if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1869             return a;
1870         }
1871         if ( bExp == 0 ) {
1872             --expDiff;
1873         }
1874         else {
1875             bSig |= 0x20000000;
1876         }
1877         shift32RightJamming( bSig, expDiff, &bSig );
1878         zExp = aExp;
1879     }
1880     else if ( expDiff < 0 ) {
1881         if ( bExp == 0xFF ) {
1882             if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1883             return packFloat32( zSign, 0xFF, 0 );
1884         }
1885         if ( aExp == 0 ) {
1886             ++expDiff;
1887         }
1888         else {
1889             aSig |= 0x20000000;
1890         }
1891         shift32RightJamming( aSig, - expDiff, &aSig );
1892         zExp = bExp;
1893     }
1894     else {
1895         if ( aExp == 0xFF ) {
1896             if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1897             return a;
1898         }
1899         if ( aExp == 0 ) {
1900             if (STATUS(flush_to_zero)) {
1901                 if (aSig | bSig) {
1902                     float_raise(float_flag_output_denormal STATUS_VAR);
1903                 }
1904                 return packFloat32(zSign, 0, 0);
1905             }
1906             return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1907         }
1908         zSig = 0x40000000 + aSig + bSig;
1909         zExp = aExp;
1910         goto roundAndPack;
1911     }
1912     aSig |= 0x20000000;
1913     zSig = ( aSig + bSig )<<1;
1914     --zExp;
1915     if ( (int32_t) zSig < 0 ) {
1916         zSig = aSig + bSig;
1917         ++zExp;
1918     }
1919  roundAndPack:
1920     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1921 
1922 }
1923 
1924 /*----------------------------------------------------------------------------
1925 | Returns the result of subtracting the absolute values of the single-
1926 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
1927 | difference is negated before being returned.  `zSign' is ignored if the
1928 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
1929 | Standard for Binary Floating-Point Arithmetic.
1930 *----------------------------------------------------------------------------*/
1931 
1932 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1933 {
1934     int_fast16_t aExp, bExp, zExp;
1935     uint32_t aSig, bSig, zSig;
1936     int_fast16_t expDiff;
1937 
1938     aSig = extractFloat32Frac( a );
1939     aExp = extractFloat32Exp( a );
1940     bSig = extractFloat32Frac( b );
1941     bExp = extractFloat32Exp( b );
1942     expDiff = aExp - bExp;
1943     aSig <<= 7;
1944     bSig <<= 7;
1945     if ( 0 < expDiff ) goto aExpBigger;
1946     if ( expDiff < 0 ) goto bExpBigger;
1947     if ( aExp == 0xFF ) {
1948         if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1949         float_raise( float_flag_invalid STATUS_VAR);
1950         return float32_default_nan;
1951     }
1952     if ( aExp == 0 ) {
1953         aExp = 1;
1954         bExp = 1;
1955     }
1956     if ( bSig < aSig ) goto aBigger;
1957     if ( aSig < bSig ) goto bBigger;
1958     return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1959  bExpBigger:
1960     if ( bExp == 0xFF ) {
1961         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1962         return packFloat32( zSign ^ 1, 0xFF, 0 );
1963     }
1964     if ( aExp == 0 ) {
1965         ++expDiff;
1966     }
1967     else {
1968         aSig |= 0x40000000;
1969     }
1970     shift32RightJamming( aSig, - expDiff, &aSig );
1971     bSig |= 0x40000000;
1972  bBigger:
1973     zSig = bSig - aSig;
1974     zExp = bExp;
1975     zSign ^= 1;
1976     goto normalizeRoundAndPack;
1977  aExpBigger:
1978     if ( aExp == 0xFF ) {
1979         if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1980         return a;
1981     }
1982     if ( bExp == 0 ) {
1983         --expDiff;
1984     }
1985     else {
1986         bSig |= 0x40000000;
1987     }
1988     shift32RightJamming( bSig, expDiff, &bSig );
1989     aSig |= 0x40000000;
1990  aBigger:
1991     zSig = aSig - bSig;
1992     zExp = aExp;
1993  normalizeRoundAndPack:
1994     --zExp;
1995     return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1996 
1997 }
1998 
1999 /*----------------------------------------------------------------------------
2000 | Returns the result of adding the single-precision floating-point values `a'
2001 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
2002 | Binary Floating-Point Arithmetic.
2003 *----------------------------------------------------------------------------*/
2004 
2005 float32 float32_add( float32 a, float32 b STATUS_PARAM )
2006 {
2007     flag aSign, bSign;
2008     a = float32_squash_input_denormal(a STATUS_VAR);
2009     b = float32_squash_input_denormal(b STATUS_VAR);
2010 
2011     aSign = extractFloat32Sign( a );
2012     bSign = extractFloat32Sign( b );
2013     if ( aSign == bSign ) {
2014         return addFloat32Sigs( a, b, aSign STATUS_VAR);
2015     }
2016     else {
2017         return subFloat32Sigs( a, b, aSign STATUS_VAR );
2018     }
2019 
2020 }
2021 
2022 /*----------------------------------------------------------------------------
2023 | Returns the result of subtracting the single-precision floating-point values
2024 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2025 | for Binary Floating-Point Arithmetic.
2026 *----------------------------------------------------------------------------*/
2027 
2028 float32 float32_sub( float32 a, float32 b STATUS_PARAM )
2029 {
2030     flag aSign, bSign;
2031     a = float32_squash_input_denormal(a STATUS_VAR);
2032     b = float32_squash_input_denormal(b STATUS_VAR);
2033 
2034     aSign = extractFloat32Sign( a );
2035     bSign = extractFloat32Sign( b );
2036     if ( aSign == bSign ) {
2037         return subFloat32Sigs( a, b, aSign STATUS_VAR );
2038     }
2039     else {
2040         return addFloat32Sigs( a, b, aSign STATUS_VAR );
2041     }
2042 
2043 }
2044 
2045 /*----------------------------------------------------------------------------
2046 | Returns the result of multiplying the single-precision floating-point values
2047 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2048 | for Binary Floating-Point Arithmetic.
2049 *----------------------------------------------------------------------------*/
2050 
2051 float32 float32_mul( float32 a, float32 b STATUS_PARAM )
2052 {
2053     flag aSign, bSign, zSign;
2054     int_fast16_t aExp, bExp, zExp;
2055     uint32_t aSig, bSig;
2056     uint64_t zSig64;
2057     uint32_t zSig;
2058 
2059     a = float32_squash_input_denormal(a STATUS_VAR);
2060     b = float32_squash_input_denormal(b STATUS_VAR);
2061 
2062     aSig = extractFloat32Frac( a );
2063     aExp = extractFloat32Exp( a );
2064     aSign = extractFloat32Sign( a );
2065     bSig = extractFloat32Frac( b );
2066     bExp = extractFloat32Exp( b );
2067     bSign = extractFloat32Sign( b );
2068     zSign = aSign ^ bSign;
2069     if ( aExp == 0xFF ) {
2070         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2071             return propagateFloat32NaN( a, b STATUS_VAR );
2072         }
2073         if ( ( bExp | bSig ) == 0 ) {
2074             float_raise( float_flag_invalid STATUS_VAR);
2075             return float32_default_nan;
2076         }
2077         return packFloat32( zSign, 0xFF, 0 );
2078     }
2079     if ( bExp == 0xFF ) {
2080         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2081         if ( ( aExp | aSig ) == 0 ) {
2082             float_raise( float_flag_invalid STATUS_VAR);
2083             return float32_default_nan;
2084         }
2085         return packFloat32( zSign, 0xFF, 0 );
2086     }
2087     if ( aExp == 0 ) {
2088         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2089         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2090     }
2091     if ( bExp == 0 ) {
2092         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
2093         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2094     }
2095     zExp = aExp + bExp - 0x7F;
2096     aSig = ( aSig | 0x00800000 )<<7;
2097     bSig = ( bSig | 0x00800000 )<<8;
2098     shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );
2099     zSig = zSig64;
2100     if ( 0 <= (int32_t) ( zSig<<1 ) ) {
2101         zSig <<= 1;
2102         --zExp;
2103     }
2104     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2105 
2106 }
2107 
2108 /*----------------------------------------------------------------------------
2109 | Returns the result of dividing the single-precision floating-point value `a'
2110 | by the corresponding value `b'.  The operation is performed according to the
2111 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2112 *----------------------------------------------------------------------------*/
2113 
2114 float32 float32_div( float32 a, float32 b STATUS_PARAM )
2115 {
2116     flag aSign, bSign, zSign;
2117     int_fast16_t aExp, bExp, zExp;
2118     uint32_t aSig, bSig, zSig;
2119     a = float32_squash_input_denormal(a STATUS_VAR);
2120     b = float32_squash_input_denormal(b STATUS_VAR);
2121 
2122     aSig = extractFloat32Frac( a );
2123     aExp = extractFloat32Exp( a );
2124     aSign = extractFloat32Sign( a );
2125     bSig = extractFloat32Frac( b );
2126     bExp = extractFloat32Exp( b );
2127     bSign = extractFloat32Sign( b );
2128     zSign = aSign ^ bSign;
2129     if ( aExp == 0xFF ) {
2130         if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2131         if ( bExp == 0xFF ) {
2132             if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2133             float_raise( float_flag_invalid STATUS_VAR);
2134             return float32_default_nan;
2135         }
2136         return packFloat32( zSign, 0xFF, 0 );
2137     }
2138     if ( bExp == 0xFF ) {
2139         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2140         return packFloat32( zSign, 0, 0 );
2141     }
2142     if ( bExp == 0 ) {
2143         if ( bSig == 0 ) {
2144             if ( ( aExp | aSig ) == 0 ) {
2145                 float_raise( float_flag_invalid STATUS_VAR);
2146                 return float32_default_nan;
2147             }
2148             float_raise( float_flag_divbyzero STATUS_VAR);
2149             return packFloat32( zSign, 0xFF, 0 );
2150         }
2151         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2152     }
2153     if ( aExp == 0 ) {
2154         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2155         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2156     }
2157     zExp = aExp - bExp + 0x7D;
2158     aSig = ( aSig | 0x00800000 )<<7;
2159     bSig = ( bSig | 0x00800000 )<<8;
2160     if ( bSig <= ( aSig + aSig ) ) {
2161         aSig >>= 1;
2162         ++zExp;
2163     }
2164     zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;
2165     if ( ( zSig & 0x3F ) == 0 ) {
2166         zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );
2167     }
2168     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2169 
2170 }
2171 
2172 /*----------------------------------------------------------------------------
2173 | Returns the remainder of the single-precision floating-point value `a'
2174 | with respect to the corresponding value `b'.  The operation is performed
2175 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2176 *----------------------------------------------------------------------------*/
2177 
2178 float32 float32_rem( float32 a, float32 b STATUS_PARAM )
2179 {
2180     flag aSign, zSign;
2181     int_fast16_t aExp, bExp, expDiff;
2182     uint32_t aSig, bSig;
2183     uint32_t q;
2184     uint64_t aSig64, bSig64, q64;
2185     uint32_t alternateASig;
2186     int32_t sigMean;
2187     a = float32_squash_input_denormal(a STATUS_VAR);
2188     b = float32_squash_input_denormal(b STATUS_VAR);
2189 
2190     aSig = extractFloat32Frac( a );
2191     aExp = extractFloat32Exp( a );
2192     aSign = extractFloat32Sign( a );
2193     bSig = extractFloat32Frac( b );
2194     bExp = extractFloat32Exp( b );
2195     if ( aExp == 0xFF ) {
2196         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2197             return propagateFloat32NaN( a, b STATUS_VAR );
2198         }
2199         float_raise( float_flag_invalid STATUS_VAR);
2200         return float32_default_nan;
2201     }
2202     if ( bExp == 0xFF ) {
2203         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2204         return a;
2205     }
2206     if ( bExp == 0 ) {
2207         if ( bSig == 0 ) {
2208             float_raise( float_flag_invalid STATUS_VAR);
2209             return float32_default_nan;
2210         }
2211         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2212     }
2213     if ( aExp == 0 ) {
2214         if ( aSig == 0 ) return a;
2215         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2216     }
2217     expDiff = aExp - bExp;
2218     aSig |= 0x00800000;
2219     bSig |= 0x00800000;
2220     if ( expDiff < 32 ) {
2221         aSig <<= 8;
2222         bSig <<= 8;
2223         if ( expDiff < 0 ) {
2224             if ( expDiff < -1 ) return a;
2225             aSig >>= 1;
2226         }
2227         q = ( bSig <= aSig );
2228         if ( q ) aSig -= bSig;
2229         if ( 0 < expDiff ) {
2230             q = ( ( (uint64_t) aSig )<<32 ) / bSig;
2231             q >>= 32 - expDiff;
2232             bSig >>= 2;
2233             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2234         }
2235         else {
2236             aSig >>= 2;
2237             bSig >>= 2;
2238         }
2239     }
2240     else {
2241         if ( bSig <= aSig ) aSig -= bSig;
2242         aSig64 = ( (uint64_t) aSig )<<40;
2243         bSig64 = ( (uint64_t) bSig )<<40;
2244         expDiff -= 64;
2245         while ( 0 < expDiff ) {
2246             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2247             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2248             aSig64 = - ( ( bSig * q64 )<<38 );
2249             expDiff -= 62;
2250         }
2251         expDiff += 64;
2252         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2253         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2254         q = q64>>( 64 - expDiff );
2255         bSig <<= 6;
2256         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2257     }
2258     do {
2259         alternateASig = aSig;
2260         ++q;
2261         aSig -= bSig;
2262     } while ( 0 <= (int32_t) aSig );
2263     sigMean = aSig + alternateASig;
2264     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2265         aSig = alternateASig;
2266     }
2267     zSign = ( (int32_t) aSig < 0 );
2268     if ( zSign ) aSig = - aSig;
2269     return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
2270 
2271 }
2272 
2273 /*----------------------------------------------------------------------------
2274 | Returns the result of multiplying the single-precision floating-point values
2275 | `a' and `b' then adding 'c', with no intermediate rounding step after the
2276 | multiplication.  The operation is performed according to the IEC/IEEE
2277 | Standard for Binary Floating-Point Arithmetic 754-2008.
2278 | The flags argument allows the caller to select negation of the
2279 | addend, the intermediate product, or the final result. (The difference
2280 | between this and having the caller do a separate negation is that negating
2281 | externally will flip the sign bit on NaNs.)
2282 *----------------------------------------------------------------------------*/
2283 
2284 float32 float32_muladd(float32 a, float32 b, float32 c, int flags STATUS_PARAM)
2285 {
2286     flag aSign, bSign, cSign, zSign;
2287     int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
2288     uint32_t aSig, bSig, cSig;
2289     flag pInf, pZero, pSign;
2290     uint64_t pSig64, cSig64, zSig64;
2291     uint32_t pSig;
2292     int shiftcount;
2293     flag signflip, infzero;
2294 
2295     a = float32_squash_input_denormal(a STATUS_VAR);
2296     b = float32_squash_input_denormal(b STATUS_VAR);
2297     c = float32_squash_input_denormal(c STATUS_VAR);
2298     aSig = extractFloat32Frac(a);
2299     aExp = extractFloat32Exp(a);
2300     aSign = extractFloat32Sign(a);
2301     bSig = extractFloat32Frac(b);
2302     bExp = extractFloat32Exp(b);
2303     bSign = extractFloat32Sign(b);
2304     cSig = extractFloat32Frac(c);
2305     cExp = extractFloat32Exp(c);
2306     cSign = extractFloat32Sign(c);
2307 
2308     infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) ||
2309                (aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0));
2310 
2311     /* It is implementation-defined whether the cases of (0,inf,qnan)
2312      * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
2313      * they return if they do), so we have to hand this information
2314      * off to the target-specific pick-a-NaN routine.
2315      */
2316     if (((aExp == 0xff) && aSig) ||
2317         ((bExp == 0xff) && bSig) ||
2318         ((cExp == 0xff) && cSig)) {
2319         return propagateFloat32MulAddNaN(a, b, c, infzero STATUS_VAR);
2320     }
2321 
2322     if (infzero) {
2323         float_raise(float_flag_invalid STATUS_VAR);
2324         return float32_default_nan;
2325     }
2326 
2327     if (flags & float_muladd_negate_c) {
2328         cSign ^= 1;
2329     }
2330 
2331     signflip = (flags & float_muladd_negate_result) ? 1 : 0;
2332 
2333     /* Work out the sign and type of the product */
2334     pSign = aSign ^ bSign;
2335     if (flags & float_muladd_negate_product) {
2336         pSign ^= 1;
2337     }
2338     pInf = (aExp == 0xff) || (bExp == 0xff);
2339     pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
2340 
2341     if (cExp == 0xff) {
2342         if (pInf && (pSign ^ cSign)) {
2343             /* addition of opposite-signed infinities => InvalidOperation */
2344             float_raise(float_flag_invalid STATUS_VAR);
2345             return float32_default_nan;
2346         }
2347         /* Otherwise generate an infinity of the same sign */
2348         return packFloat32(cSign ^ signflip, 0xff, 0);
2349     }
2350 
2351     if (pInf) {
2352         return packFloat32(pSign ^ signflip, 0xff, 0);
2353     }
2354 
2355     if (pZero) {
2356         if (cExp == 0) {
2357             if (cSig == 0) {
2358                 /* Adding two exact zeroes */
2359                 if (pSign == cSign) {
2360                     zSign = pSign;
2361                 } else if (STATUS(float_rounding_mode) == float_round_down) {
2362                     zSign = 1;
2363                 } else {
2364                     zSign = 0;
2365                 }
2366                 return packFloat32(zSign ^ signflip, 0, 0);
2367             }
2368             /* Exact zero plus a denorm */
2369             if (STATUS(flush_to_zero)) {
2370                 float_raise(float_flag_output_denormal STATUS_VAR);
2371                 return packFloat32(cSign ^ signflip, 0, 0);
2372             }
2373         }
2374         /* Zero plus something non-zero : just return the something */
2375         return packFloat32(cSign ^ signflip, cExp, cSig);
2376     }
2377 
2378     if (aExp == 0) {
2379         normalizeFloat32Subnormal(aSig, &aExp, &aSig);
2380     }
2381     if (bExp == 0) {
2382         normalizeFloat32Subnormal(bSig, &bExp, &bSig);
2383     }
2384 
2385     /* Calculate the actual result a * b + c */
2386 
2387     /* Multiply first; this is easy. */
2388     /* NB: we subtract 0x7e where float32_mul() subtracts 0x7f
2389      * because we want the true exponent, not the "one-less-than"
2390      * flavour that roundAndPackFloat32() takes.
2391      */
2392     pExp = aExp + bExp - 0x7e;
2393     aSig = (aSig | 0x00800000) << 7;
2394     bSig = (bSig | 0x00800000) << 8;
2395     pSig64 = (uint64_t)aSig * bSig;
2396     if ((int64_t)(pSig64 << 1) >= 0) {
2397         pSig64 <<= 1;
2398         pExp--;
2399     }
2400 
2401     zSign = pSign ^ signflip;
2402 
2403     /* Now pSig64 is the significand of the multiply, with the explicit bit in
2404      * position 62.
2405      */
2406     if (cExp == 0) {
2407         if (!cSig) {
2408             /* Throw out the special case of c being an exact zero now */
2409             shift64RightJamming(pSig64, 32, &pSig64);
2410             pSig = pSig64;
2411             return roundAndPackFloat32(zSign, pExp - 1,
2412                                        pSig STATUS_VAR);
2413         }
2414         normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2415     }
2416 
2417     cSig64 = (uint64_t)cSig << (62 - 23);
2418     cSig64 |= LIT64(0x4000000000000000);
2419     expDiff = pExp - cExp;
2420 
2421     if (pSign == cSign) {
2422         /* Addition */
2423         if (expDiff > 0) {
2424             /* scale c to match p */
2425             shift64RightJamming(cSig64, expDiff, &cSig64);
2426             zExp = pExp;
2427         } else if (expDiff < 0) {
2428             /* scale p to match c */
2429             shift64RightJamming(pSig64, -expDiff, &pSig64);
2430             zExp = cExp;
2431         } else {
2432             /* no scaling needed */
2433             zExp = cExp;
2434         }
2435         /* Add significands and make sure explicit bit ends up in posn 62 */
2436         zSig64 = pSig64 + cSig64;
2437         if ((int64_t)zSig64 < 0) {
2438             shift64RightJamming(zSig64, 1, &zSig64);
2439         } else {
2440             zExp--;
2441         }
2442     } else {
2443         /* Subtraction */
2444         if (expDiff > 0) {
2445             shift64RightJamming(cSig64, expDiff, &cSig64);
2446             zSig64 = pSig64 - cSig64;
2447             zExp = pExp;
2448         } else if (expDiff < 0) {
2449             shift64RightJamming(pSig64, -expDiff, &pSig64);
2450             zSig64 = cSig64 - pSig64;
2451             zExp = cExp;
2452             zSign ^= 1;
2453         } else {
2454             zExp = pExp;
2455             if (cSig64 < pSig64) {
2456                 zSig64 = pSig64 - cSig64;
2457             } else if (pSig64 < cSig64) {
2458                 zSig64 = cSig64 - pSig64;
2459                 zSign ^= 1;
2460             } else {
2461                 /* Exact zero */
2462                 zSign = signflip;
2463                 if (STATUS(float_rounding_mode) == float_round_down) {
2464                     zSign ^= 1;
2465                 }
2466                 return packFloat32(zSign, 0, 0);
2467             }
2468         }
2469         --zExp;
2470         /* Normalize to put the explicit bit back into bit 62. */
2471         shiftcount = countLeadingZeros64(zSig64) - 1;
2472         zSig64 <<= shiftcount;
2473         zExp -= shiftcount;
2474     }
2475     shift64RightJamming(zSig64, 32, &zSig64);
2476     return roundAndPackFloat32(zSign, zExp, zSig64 STATUS_VAR);
2477 }
2478 
2479 
2480 /*----------------------------------------------------------------------------
2481 | Returns the square root of the single-precision floating-point value `a'.
2482 | The operation is performed according to the IEC/IEEE Standard for Binary
2483 | Floating-Point Arithmetic.
2484 *----------------------------------------------------------------------------*/
2485 
2486 float32 float32_sqrt( float32 a STATUS_PARAM )
2487 {
2488     flag aSign;
2489     int_fast16_t aExp, zExp;
2490     uint32_t aSig, zSig;
2491     uint64_t rem, term;
2492     a = float32_squash_input_denormal(a STATUS_VAR);
2493 
2494     aSig = extractFloat32Frac( a );
2495     aExp = extractFloat32Exp( a );
2496     aSign = extractFloat32Sign( a );
2497     if ( aExp == 0xFF ) {
2498         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2499         if ( ! aSign ) return a;
2500         float_raise( float_flag_invalid STATUS_VAR);
2501         return float32_default_nan;
2502     }
2503     if ( aSign ) {
2504         if ( ( aExp | aSig ) == 0 ) return a;
2505         float_raise( float_flag_invalid STATUS_VAR);
2506         return float32_default_nan;
2507     }
2508     if ( aExp == 0 ) {
2509         if ( aSig == 0 ) return float32_zero;
2510         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2511     }
2512     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2513     aSig = ( aSig | 0x00800000 )<<8;
2514     zSig = estimateSqrt32( aExp, aSig ) + 2;
2515     if ( ( zSig & 0x7F ) <= 5 ) {
2516         if ( zSig < 2 ) {
2517             zSig = 0x7FFFFFFF;
2518             goto roundAndPack;
2519         }
2520         aSig >>= aExp & 1;
2521         term = ( (uint64_t) zSig ) * zSig;
2522         rem = ( ( (uint64_t) aSig )<<32 ) - term;
2523         while ( (int64_t) rem < 0 ) {
2524             --zSig;
2525             rem += ( ( (uint64_t) zSig )<<1 ) | 1;
2526         }
2527         zSig |= ( rem != 0 );
2528     }
2529     shift32RightJamming( zSig, 1, &zSig );
2530  roundAndPack:
2531     return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2532 
2533 }
2534 
2535 /*----------------------------------------------------------------------------
2536 | Returns the binary exponential of the single-precision floating-point value
2537 | `a'. The operation is performed according to the IEC/IEEE Standard for
2538 | Binary Floating-Point Arithmetic.
2539 |
2540 | Uses the following identities:
2541 |
2542 | 1. -------------------------------------------------------------------------
2543 |      x    x*ln(2)
2544 |     2  = e
2545 |
2546 | 2. -------------------------------------------------------------------------
2547 |                      2     3     4     5           n
2548 |      x        x     x     x     x     x           x
2549 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2550 |               1!    2!    3!    4!    5!          n!
2551 *----------------------------------------------------------------------------*/
2552 
2553 static const float64 float32_exp2_coefficients[15] =
2554 {
2555     const_float64( 0x3ff0000000000000ll ), /*  1 */
2556     const_float64( 0x3fe0000000000000ll ), /*  2 */
2557     const_float64( 0x3fc5555555555555ll ), /*  3 */
2558     const_float64( 0x3fa5555555555555ll ), /*  4 */
2559     const_float64( 0x3f81111111111111ll ), /*  5 */
2560     const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
2561     const_float64( 0x3f2a01a01a01a01all ), /*  7 */
2562     const_float64( 0x3efa01a01a01a01all ), /*  8 */
2563     const_float64( 0x3ec71de3a556c734ll ), /*  9 */
2564     const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2565     const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2566     const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2567     const_float64( 0x3de6124613a86d09ll ), /* 13 */
2568     const_float64( 0x3da93974a8c07c9dll ), /* 14 */
2569     const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
2570 };
2571 
2572 float32 float32_exp2( float32 a STATUS_PARAM )
2573 {
2574     flag aSign;
2575     int_fast16_t aExp;
2576     uint32_t aSig;
2577     float64 r, x, xn;
2578     int i;
2579     a = float32_squash_input_denormal(a STATUS_VAR);
2580 
2581     aSig = extractFloat32Frac( a );
2582     aExp = extractFloat32Exp( a );
2583     aSign = extractFloat32Sign( a );
2584 
2585     if ( aExp == 0xFF) {
2586         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2587         return (aSign) ? float32_zero : a;
2588     }
2589     if (aExp == 0) {
2590         if (aSig == 0) return float32_one;
2591     }
2592 
2593     float_raise( float_flag_inexact STATUS_VAR);
2594 
2595     /* ******************************* */
2596     /* using float64 for approximation */
2597     /* ******************************* */
2598     x = float32_to_float64(a STATUS_VAR);
2599     x = float64_mul(x, float64_ln2 STATUS_VAR);
2600 
2601     xn = x;
2602     r = float64_one;
2603     for (i = 0 ; i < 15 ; i++) {
2604         float64 f;
2605 
2606         f = float64_mul(xn, float32_exp2_coefficients[i] STATUS_VAR);
2607         r = float64_add(r, f STATUS_VAR);
2608 
2609         xn = float64_mul(xn, x STATUS_VAR);
2610     }
2611 
2612     return float64_to_float32(r, status);
2613 }
2614 
2615 /*----------------------------------------------------------------------------
2616 | Returns the binary log of the single-precision floating-point value `a'.
2617 | The operation is performed according to the IEC/IEEE Standard for Binary
2618 | Floating-Point Arithmetic.
2619 *----------------------------------------------------------------------------*/
2620 float32 float32_log2( float32 a STATUS_PARAM )
2621 {
2622     flag aSign, zSign;
2623     int_fast16_t aExp;
2624     uint32_t aSig, zSig, i;
2625 
2626     a = float32_squash_input_denormal(a STATUS_VAR);
2627     aSig = extractFloat32Frac( a );
2628     aExp = extractFloat32Exp( a );
2629     aSign = extractFloat32Sign( a );
2630 
2631     if ( aExp == 0 ) {
2632         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2633         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2634     }
2635     if ( aSign ) {
2636         float_raise( float_flag_invalid STATUS_VAR);
2637         return float32_default_nan;
2638     }
2639     if ( aExp == 0xFF ) {
2640         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2641         return a;
2642     }
2643 
2644     aExp -= 0x7F;
2645     aSig |= 0x00800000;
2646     zSign = aExp < 0;
2647     zSig = aExp << 23;
2648 
2649     for (i = 1 << 22; i > 0; i >>= 1) {
2650         aSig = ( (uint64_t)aSig * aSig ) >> 23;
2651         if ( aSig & 0x01000000 ) {
2652             aSig >>= 1;
2653             zSig |= i;
2654         }
2655     }
2656 
2657     if ( zSign )
2658         zSig = -zSig;
2659 
2660     return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );
2661 }
2662 
2663 /*----------------------------------------------------------------------------
2664 | Returns 1 if the single-precision floating-point value `a' is equal to
2665 | the corresponding value `b', and 0 otherwise.  The invalid exception is
2666 | raised if either operand is a NaN.  Otherwise, the comparison is performed
2667 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2668 *----------------------------------------------------------------------------*/
2669 
2670 int float32_eq( float32 a, float32 b STATUS_PARAM )
2671 {
2672     uint32_t av, bv;
2673     a = float32_squash_input_denormal(a STATUS_VAR);
2674     b = float32_squash_input_denormal(b STATUS_VAR);
2675 
2676     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2677          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2678        ) {
2679         float_raise( float_flag_invalid STATUS_VAR);
2680         return 0;
2681     }
2682     av = float32_val(a);
2683     bv = float32_val(b);
2684     return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2685 }
2686 
2687 /*----------------------------------------------------------------------------
2688 | Returns 1 if the single-precision floating-point value `a' is less than
2689 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
2690 | exception is raised if either operand is a NaN.  The comparison is performed
2691 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2692 *----------------------------------------------------------------------------*/
2693 
2694 int float32_le( float32 a, float32 b STATUS_PARAM )
2695 {
2696     flag aSign, bSign;
2697     uint32_t av, bv;
2698     a = float32_squash_input_denormal(a STATUS_VAR);
2699     b = float32_squash_input_denormal(b STATUS_VAR);
2700 
2701     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2702          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2703        ) {
2704         float_raise( float_flag_invalid STATUS_VAR);
2705         return 0;
2706     }
2707     aSign = extractFloat32Sign( a );
2708     bSign = extractFloat32Sign( b );
2709     av = float32_val(a);
2710     bv = float32_val(b);
2711     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2712     return ( av == bv ) || ( aSign ^ ( av < bv ) );
2713 
2714 }
2715 
2716 /*----------------------------------------------------------------------------
2717 | Returns 1 if the single-precision floating-point value `a' is less than
2718 | the corresponding value `b', and 0 otherwise.  The invalid exception is
2719 | raised if either operand is a NaN.  The comparison is performed according
2720 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2721 *----------------------------------------------------------------------------*/
2722 
2723 int float32_lt( float32 a, float32 b STATUS_PARAM )
2724 {
2725     flag aSign, bSign;
2726     uint32_t av, bv;
2727     a = float32_squash_input_denormal(a STATUS_VAR);
2728     b = float32_squash_input_denormal(b STATUS_VAR);
2729 
2730     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2731          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2732        ) {
2733         float_raise( float_flag_invalid STATUS_VAR);
2734         return 0;
2735     }
2736     aSign = extractFloat32Sign( a );
2737     bSign = extractFloat32Sign( b );
2738     av = float32_val(a);
2739     bv = float32_val(b);
2740     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2741     return ( av != bv ) && ( aSign ^ ( av < bv ) );
2742 
2743 }
2744 
2745 /*----------------------------------------------------------------------------
2746 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2747 | be compared, and 0 otherwise.  The invalid exception is raised if either
2748 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
2749 | Standard for Binary Floating-Point Arithmetic.
2750 *----------------------------------------------------------------------------*/
2751 
2752 int float32_unordered( float32 a, float32 b STATUS_PARAM )
2753 {
2754     a = float32_squash_input_denormal(a STATUS_VAR);
2755     b = float32_squash_input_denormal(b STATUS_VAR);
2756 
2757     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2758          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2759        ) {
2760         float_raise( float_flag_invalid STATUS_VAR);
2761         return 1;
2762     }
2763     return 0;
2764 }
2765 
2766 /*----------------------------------------------------------------------------
2767 | Returns 1 if the single-precision floating-point value `a' is equal to
2768 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2769 | exception.  The comparison is performed according to the IEC/IEEE Standard
2770 | for Binary Floating-Point Arithmetic.
2771 *----------------------------------------------------------------------------*/
2772 
2773 int float32_eq_quiet( float32 a, float32 b STATUS_PARAM )
2774 {
2775     a = float32_squash_input_denormal(a STATUS_VAR);
2776     b = float32_squash_input_denormal(b STATUS_VAR);
2777 
2778     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2779          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2780        ) {
2781         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2782             float_raise( float_flag_invalid STATUS_VAR);
2783         }
2784         return 0;
2785     }
2786     return ( float32_val(a) == float32_val(b) ) ||
2787             ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2788 }
2789 
2790 /*----------------------------------------------------------------------------
2791 | Returns 1 if the single-precision floating-point value `a' is less than or
2792 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2793 | cause an exception.  Otherwise, the comparison is performed according to the
2794 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2795 *----------------------------------------------------------------------------*/
2796 
2797 int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2798 {
2799     flag aSign, bSign;
2800     uint32_t av, bv;
2801     a = float32_squash_input_denormal(a STATUS_VAR);
2802     b = float32_squash_input_denormal(b STATUS_VAR);
2803 
2804     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2805          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2806        ) {
2807         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2808             float_raise( float_flag_invalid STATUS_VAR);
2809         }
2810         return 0;
2811     }
2812     aSign = extractFloat32Sign( a );
2813     bSign = extractFloat32Sign( b );
2814     av = float32_val(a);
2815     bv = float32_val(b);
2816     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2817     return ( av == bv ) || ( aSign ^ ( av < bv ) );
2818 
2819 }
2820 
2821 /*----------------------------------------------------------------------------
2822 | Returns 1 if the single-precision floating-point value `a' is less than
2823 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2824 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2825 | Standard for Binary Floating-Point Arithmetic.
2826 *----------------------------------------------------------------------------*/
2827 
2828 int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2829 {
2830     flag aSign, bSign;
2831     uint32_t av, bv;
2832     a = float32_squash_input_denormal(a STATUS_VAR);
2833     b = float32_squash_input_denormal(b STATUS_VAR);
2834 
2835     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2836          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2837        ) {
2838         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2839             float_raise( float_flag_invalid STATUS_VAR);
2840         }
2841         return 0;
2842     }
2843     aSign = extractFloat32Sign( a );
2844     bSign = extractFloat32Sign( b );
2845     av = float32_val(a);
2846     bv = float32_val(b);
2847     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2848     return ( av != bv ) && ( aSign ^ ( av < bv ) );
2849 
2850 }
2851 
2852 /*----------------------------------------------------------------------------
2853 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2854 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
2855 | comparison is performed according to the IEC/IEEE Standard for Binary
2856 | Floating-Point Arithmetic.
2857 *----------------------------------------------------------------------------*/
2858 
2859 int float32_unordered_quiet( float32 a, float32 b STATUS_PARAM )
2860 {
2861     a = float32_squash_input_denormal(a STATUS_VAR);
2862     b = float32_squash_input_denormal(b STATUS_VAR);
2863 
2864     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2865          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2866        ) {
2867         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2868             float_raise( float_flag_invalid STATUS_VAR);
2869         }
2870         return 1;
2871     }
2872     return 0;
2873 }
2874 
2875 /*----------------------------------------------------------------------------
2876 | Returns the result of converting the double-precision floating-point value
2877 | `a' to the 32-bit two's complement integer format.  The conversion is
2878 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2879 | Arithmetic---which means in particular that the conversion is rounded
2880 | according to the current rounding mode.  If `a' is a NaN, the largest
2881 | positive integer is returned.  Otherwise, if the conversion overflows, the
2882 | largest integer with the same sign as `a' is returned.
2883 *----------------------------------------------------------------------------*/
2884 
2885 int32 float64_to_int32( float64 a STATUS_PARAM )
2886 {
2887     flag aSign;
2888     int_fast16_t aExp, shiftCount;
2889     uint64_t aSig;
2890     a = float64_squash_input_denormal(a STATUS_VAR);
2891 
2892     aSig = extractFloat64Frac( a );
2893     aExp = extractFloat64Exp( a );
2894     aSign = extractFloat64Sign( a );
2895     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2896     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2897     shiftCount = 0x42C - aExp;
2898     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2899     return roundAndPackInt32( aSign, aSig STATUS_VAR );
2900 
2901 }
2902 
2903 /*----------------------------------------------------------------------------
2904 | Returns the result of converting the double-precision floating-point value
2905 | `a' to the 32-bit two's complement integer format.  The conversion is
2906 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2907 | Arithmetic, except that the conversion is always rounded toward zero.
2908 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2909 | the conversion overflows, the largest integer with the same sign as `a' is
2910 | returned.
2911 *----------------------------------------------------------------------------*/
2912 
2913 int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2914 {
2915     flag aSign;
2916     int_fast16_t aExp, shiftCount;
2917     uint64_t aSig, savedASig;
2918     int32_t z;
2919     a = float64_squash_input_denormal(a STATUS_VAR);
2920 
2921     aSig = extractFloat64Frac( a );
2922     aExp = extractFloat64Exp( a );
2923     aSign = extractFloat64Sign( a );
2924     if ( 0x41E < aExp ) {
2925         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2926         goto invalid;
2927     }
2928     else if ( aExp < 0x3FF ) {
2929         if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2930         return 0;
2931     }
2932     aSig |= LIT64( 0x0010000000000000 );
2933     shiftCount = 0x433 - aExp;
2934     savedASig = aSig;
2935     aSig >>= shiftCount;
2936     z = aSig;
2937     if ( aSign ) z = - z;
2938     if ( ( z < 0 ) ^ aSign ) {
2939  invalid:
2940         float_raise( float_flag_invalid STATUS_VAR);
2941         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
2942     }
2943     if ( ( aSig<<shiftCount ) != savedASig ) {
2944         STATUS(float_exception_flags) |= float_flag_inexact;
2945     }
2946     return z;
2947 
2948 }
2949 
2950 /*----------------------------------------------------------------------------
2951 | Returns the result of converting the double-precision floating-point value
2952 | `a' to the 16-bit two's complement integer format.  The conversion is
2953 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2954 | Arithmetic, except that the conversion is always rounded toward zero.
2955 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2956 | the conversion overflows, the largest integer with the same sign as `a' is
2957 | returned.
2958 *----------------------------------------------------------------------------*/
2959 
2960 int_fast16_t float64_to_int16_round_to_zero(float64 a STATUS_PARAM)
2961 {
2962     flag aSign;
2963     int_fast16_t aExp, shiftCount;
2964     uint64_t aSig, savedASig;
2965     int32 z;
2966 
2967     aSig = extractFloat64Frac( a );
2968     aExp = extractFloat64Exp( a );
2969     aSign = extractFloat64Sign( a );
2970     if ( 0x40E < aExp ) {
2971         if ( ( aExp == 0x7FF ) && aSig ) {
2972             aSign = 0;
2973         }
2974         goto invalid;
2975     }
2976     else if ( aExp < 0x3FF ) {
2977         if ( aExp || aSig ) {
2978             STATUS(float_exception_flags) |= float_flag_inexact;
2979         }
2980         return 0;
2981     }
2982     aSig |= LIT64( 0x0010000000000000 );
2983     shiftCount = 0x433 - aExp;
2984     savedASig = aSig;
2985     aSig >>= shiftCount;
2986     z = aSig;
2987     if ( aSign ) {
2988         z = - z;
2989     }
2990     if ( ( (int16_t)z < 0 ) ^ aSign ) {
2991  invalid:
2992         float_raise( float_flag_invalid STATUS_VAR);
2993         return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
2994     }
2995     if ( ( aSig<<shiftCount ) != savedASig ) {
2996         STATUS(float_exception_flags) |= float_flag_inexact;
2997     }
2998     return z;
2999 }
3000 
3001 /*----------------------------------------------------------------------------
3002 | Returns the result of converting the double-precision floating-point value
3003 | `a' to the 64-bit two's complement integer format.  The conversion is
3004 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3005 | Arithmetic---which means in particular that the conversion is rounded
3006 | according to the current rounding mode.  If `a' is a NaN, the largest
3007 | positive integer is returned.  Otherwise, if the conversion overflows, the
3008 | largest integer with the same sign as `a' is returned.
3009 *----------------------------------------------------------------------------*/
3010 
3011 int64 float64_to_int64( float64 a STATUS_PARAM )
3012 {
3013     flag aSign;
3014     int_fast16_t aExp, shiftCount;
3015     uint64_t aSig, aSigExtra;
3016     a = float64_squash_input_denormal(a STATUS_VAR);
3017 
3018     aSig = extractFloat64Frac( a );
3019     aExp = extractFloat64Exp( a );
3020     aSign = extractFloat64Sign( a );
3021     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3022     shiftCount = 0x433 - aExp;
3023     if ( shiftCount <= 0 ) {
3024         if ( 0x43E < aExp ) {
3025             float_raise( float_flag_invalid STATUS_VAR);
3026             if (    ! aSign
3027                  || (    ( aExp == 0x7FF )
3028                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
3029                ) {
3030                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3031             }
3032             return (int64_t) LIT64( 0x8000000000000000 );
3033         }
3034         aSigExtra = 0;
3035         aSig <<= - shiftCount;
3036     }
3037     else {
3038         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3039     }
3040     return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
3041 
3042 }
3043 
3044 /*----------------------------------------------------------------------------
3045 | Returns the result of converting the double-precision floating-point value
3046 | `a' to the 64-bit two's complement integer format.  The conversion is
3047 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3048 | Arithmetic, except that the conversion is always rounded toward zero.
3049 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
3050 | the conversion overflows, the largest integer with the same sign as `a' is
3051 | returned.
3052 *----------------------------------------------------------------------------*/
3053 
3054 int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
3055 {
3056     flag aSign;
3057     int_fast16_t aExp, shiftCount;
3058     uint64_t aSig;
3059     int64 z;
3060     a = float64_squash_input_denormal(a STATUS_VAR);
3061 
3062     aSig = extractFloat64Frac( a );
3063     aExp = extractFloat64Exp( a );
3064     aSign = extractFloat64Sign( a );
3065     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3066     shiftCount = aExp - 0x433;
3067     if ( 0 <= shiftCount ) {
3068         if ( 0x43E <= aExp ) {
3069             if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
3070                 float_raise( float_flag_invalid STATUS_VAR);
3071                 if (    ! aSign
3072                      || (    ( aExp == 0x7FF )
3073                           && ( aSig != LIT64( 0x0010000000000000 ) ) )
3074                    ) {
3075                     return LIT64( 0x7FFFFFFFFFFFFFFF );
3076                 }
3077             }
3078             return (int64_t) LIT64( 0x8000000000000000 );
3079         }
3080         z = aSig<<shiftCount;
3081     }
3082     else {
3083         if ( aExp < 0x3FE ) {
3084             if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3085             return 0;
3086         }
3087         z = aSig>>( - shiftCount );
3088         if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
3089             STATUS(float_exception_flags) |= float_flag_inexact;
3090         }
3091     }
3092     if ( aSign ) z = - z;
3093     return z;
3094 
3095 }
3096 
3097 /*----------------------------------------------------------------------------
3098 | Returns the result of converting the double-precision floating-point value
3099 | `a' to the single-precision floating-point format.  The conversion is
3100 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3101 | Arithmetic.
3102 *----------------------------------------------------------------------------*/
3103 
3104 float32 float64_to_float32( float64 a STATUS_PARAM )
3105 {
3106     flag aSign;
3107     int_fast16_t aExp;
3108     uint64_t aSig;
3109     uint32_t zSig;
3110     a = float64_squash_input_denormal(a STATUS_VAR);
3111 
3112     aSig = extractFloat64Frac( a );
3113     aExp = extractFloat64Exp( a );
3114     aSign = extractFloat64Sign( a );
3115     if ( aExp == 0x7FF ) {
3116         if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3117         return packFloat32( aSign, 0xFF, 0 );
3118     }
3119     shift64RightJamming( aSig, 22, &aSig );
3120     zSig = aSig;
3121     if ( aExp || zSig ) {
3122         zSig |= 0x40000000;
3123         aExp -= 0x381;
3124     }
3125     return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
3126 
3127 }
3128 
3129 
3130 /*----------------------------------------------------------------------------
3131 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3132 | half-precision floating-point value, returning the result.  After being
3133 | shifted into the proper positions, the three fields are simply added
3134 | together to form the result.  This means that any integer portion of `zSig'
3135 | will be added into the exponent.  Since a properly normalized significand
3136 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3137 | than the desired result exponent whenever `zSig' is a complete, normalized
3138 | significand.
3139 *----------------------------------------------------------------------------*/
3140 static float16 packFloat16(flag zSign, int_fast16_t zExp, uint16_t zSig)
3141 {
3142     return make_float16(
3143         (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
3144 }
3145 
3146 /*----------------------------------------------------------------------------
3147 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3148 | and significand `zSig', and returns the proper half-precision floating-
3149 | point value corresponding to the abstract input.  Ordinarily, the abstract
3150 | value is simply rounded and packed into the half-precision format, with
3151 | the inexact exception raised if the abstract input cannot be represented
3152 | exactly.  However, if the abstract value is too large, the overflow and
3153 | inexact exceptions are raised and an infinity or maximal finite value is
3154 | returned.  If the abstract value is too small, the input value is rounded to
3155 | a subnormal number, and the underflow and inexact exceptions are raised if
3156 | the abstract input cannot be represented exactly as a subnormal half-
3157 | precision floating-point number.
3158 | The `ieee' flag indicates whether to use IEEE standard half precision, or
3159 | ARM-style "alternative representation", which omits the NaN and Inf
3160 | encodings in order to raise the maximum representable exponent by one.
3161 |     The input significand `zSig' has its binary point between bits 22
3162 | and 23, which is 13 bits to the left of the usual location.  This shifted
3163 | significand must be normalized or smaller.  If `zSig' is not normalized,
3164 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3165 | and it must not require rounding.  In the usual case that `zSig' is
3166 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3167 | Note the slightly odd position of the binary point in zSig compared with the
3168 | other roundAndPackFloat functions. This should probably be fixed if we
3169 | need to implement more float16 routines than just conversion.
3170 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3171 | Binary Floating-Point Arithmetic.
3172 *----------------------------------------------------------------------------*/
3173 
3174 static float32 roundAndPackFloat16(flag zSign, int_fast16_t zExp,
3175                                    uint32_t zSig, flag ieee STATUS_PARAM)
3176 {
3177     int maxexp = ieee ? 29 : 30;
3178     uint32_t mask;
3179     uint32_t increment;
3180     bool rounding_bumps_exp;
3181     bool is_tiny = false;
3182 
3183     /* Calculate the mask of bits of the mantissa which are not
3184      * representable in half-precision and will be lost.
3185      */
3186     if (zExp < 1) {
3187         /* Will be denormal in halfprec */
3188         mask = 0x00ffffff;
3189         if (zExp >= -11) {
3190             mask >>= 11 + zExp;
3191         }
3192     } else {
3193         /* Normal number in halfprec */
3194         mask = 0x00001fff;
3195     }
3196 
3197     switch (STATUS(float_rounding_mode)) {
3198     case float_round_nearest_even:
3199         increment = (mask + 1) >> 1;
3200         if ((zSig & mask) == increment) {
3201             increment = zSig & (increment << 1);
3202         }
3203         break;
3204     case float_round_ties_away:
3205         increment = (mask + 1) >> 1;
3206         break;
3207     case float_round_up:
3208         increment = zSign ? 0 : mask;
3209         break;
3210     case float_round_down:
3211         increment = zSign ? mask : 0;
3212         break;
3213     default: /* round_to_zero */
3214         increment = 0;
3215         break;
3216     }
3217 
3218     rounding_bumps_exp = (zSig + increment >= 0x01000000);
3219 
3220     if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) {
3221         if (ieee) {
3222             float_raise(float_flag_overflow | float_flag_inexact STATUS_VAR);
3223             return packFloat16(zSign, 0x1f, 0);
3224         } else {
3225             float_raise(float_flag_invalid STATUS_VAR);
3226             return packFloat16(zSign, 0x1f, 0x3ff);
3227         }
3228     }
3229 
3230     if (zExp < 0) {
3231         /* Note that flush-to-zero does not affect half-precision results */
3232         is_tiny =
3233             (STATUS(float_detect_tininess) == float_tininess_before_rounding)
3234             || (zExp < -1)
3235             || (!rounding_bumps_exp);
3236     }
3237     if (zSig & mask) {
3238         float_raise(float_flag_inexact STATUS_VAR);
3239         if (is_tiny) {
3240             float_raise(float_flag_underflow STATUS_VAR);
3241         }
3242     }
3243 
3244     zSig += increment;
3245     if (rounding_bumps_exp) {
3246         zSig >>= 1;
3247         zExp++;
3248     }
3249 
3250     if (zExp < -10) {
3251         return packFloat16(zSign, 0, 0);
3252     }
3253     if (zExp < 0) {
3254         zSig >>= -zExp;
3255         zExp = 0;
3256     }
3257     return packFloat16(zSign, zExp, zSig >> 13);
3258 }
3259 
/* Shift a non-zero half-precision subnormal fraction left until its most
 * significant set bit reaches bit 10, and report the matching exponent.
 */
static void normalizeFloat16Subnormal(uint32_t aSig, int_fast16_t *zExpPtr,
                                      uint32_t *zSigPtr)
{
    /* 21 leading zeros means bit 10 is already the top set bit. */
    int8_t lshift = countLeadingZeros32(aSig) - 21;

    *zSigPtr = aSig << lshift;
    *zExpPtr = 1 - lshift;
}
3267 
3268 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
3269    The latter gains extra exponent range by omitting the NaN/Inf encodings.  */
3270 
3271 float32 float16_to_float32(float16 a, flag ieee STATUS_PARAM)
3272 {
3273     flag aSign;
3274     int_fast16_t aExp;
3275     uint32_t aSig;
3276 
3277     aSign = extractFloat16Sign(a);
3278     aExp = extractFloat16Exp(a);
3279     aSig = extractFloat16Frac(a);
3280 
3281     if (aExp == 0x1f && ieee) {
3282         if (aSig) {
3283             return commonNaNToFloat32(float16ToCommonNaN(a STATUS_VAR) STATUS_VAR);
3284         }
3285         return packFloat32(aSign, 0xff, 0);
3286     }
3287     if (aExp == 0) {
3288         if (aSig == 0) {
3289             return packFloat32(aSign, 0, 0);
3290         }
3291 
3292         normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3293         aExp--;
3294     }
3295     return packFloat32( aSign, aExp + 0x70, aSig << 13);
3296 }
3297 
3298 float16 float32_to_float16(float32 a, flag ieee STATUS_PARAM)
3299 {
3300     flag aSign;
3301     int_fast16_t aExp;
3302     uint32_t aSig;
3303 
3304     a = float32_squash_input_denormal(a STATUS_VAR);
3305 
3306     aSig = extractFloat32Frac( a );
3307     aExp = extractFloat32Exp( a );
3308     aSign = extractFloat32Sign( a );
3309     if ( aExp == 0xFF ) {
3310         if (aSig) {
3311             /* Input is a NaN */
3312             if (!ieee) {
3313                 float_raise(float_flag_invalid STATUS_VAR);
3314                 return packFloat16(aSign, 0, 0);
3315             }
3316             return commonNaNToFloat16(
3317                 float32ToCommonNaN(a STATUS_VAR) STATUS_VAR);
3318         }
3319         /* Infinity */
3320         if (!ieee) {
3321             float_raise(float_flag_invalid STATUS_VAR);
3322             return packFloat16(aSign, 0x1f, 0x3ff);
3323         }
3324         return packFloat16(aSign, 0x1f, 0);
3325     }
3326     if (aExp == 0 && aSig == 0) {
3327         return packFloat16(aSign, 0, 0);
3328     }
3329     /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3330      * even if the input is denormal; however this is harmless because
3331      * the largest possible single-precision denormal is still smaller
3332      * than the smallest representable half-precision denormal, and so we
3333      * will end up ignoring aSig and returning via the "always return zero"
3334      * codepath.
3335      */
3336     aSig |= 0x00800000;
3337     aExp -= 0x71;
3338 
3339     return roundAndPackFloat16(aSign, aExp, aSig, ieee STATUS_VAR);
3340 }
3341 
3342 float64 float16_to_float64(float16 a, flag ieee STATUS_PARAM)
3343 {
3344     flag aSign;
3345     int_fast16_t aExp;
3346     uint32_t aSig;
3347 
3348     aSign = extractFloat16Sign(a);
3349     aExp = extractFloat16Exp(a);
3350     aSig = extractFloat16Frac(a);
3351 
3352     if (aExp == 0x1f && ieee) {
3353         if (aSig) {
3354             return commonNaNToFloat64(
3355                 float16ToCommonNaN(a STATUS_VAR) STATUS_VAR);
3356         }
3357         return packFloat64(aSign, 0x7ff, 0);
3358     }
3359     if (aExp == 0) {
3360         if (aSig == 0) {
3361             return packFloat64(aSign, 0, 0);
3362         }
3363 
3364         normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3365         aExp--;
3366     }
3367     return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42);
3368 }
3369 
3370 float16 float64_to_float16(float64 a, flag ieee STATUS_PARAM)
3371 {
3372     flag aSign;
3373     int_fast16_t aExp;
3374     uint64_t aSig;
3375     uint32_t zSig;
3376 
3377     a = float64_squash_input_denormal(a STATUS_VAR);
3378 
3379     aSig = extractFloat64Frac(a);
3380     aExp = extractFloat64Exp(a);
3381     aSign = extractFloat64Sign(a);
3382     if (aExp == 0x7FF) {
3383         if (aSig) {
3384             /* Input is a NaN */
3385             if (!ieee) {
3386                 float_raise(float_flag_invalid STATUS_VAR);
3387                 return packFloat16(aSign, 0, 0);
3388             }
3389             return commonNaNToFloat16(
3390                 float64ToCommonNaN(a STATUS_VAR) STATUS_VAR);
3391         }
3392         /* Infinity */
3393         if (!ieee) {
3394             float_raise(float_flag_invalid STATUS_VAR);
3395             return packFloat16(aSign, 0x1f, 0x3ff);
3396         }
3397         return packFloat16(aSign, 0x1f, 0);
3398     }
3399     shift64RightJamming(aSig, 29, &aSig);
3400     zSig = aSig;
3401     if (aExp == 0 && zSig == 0) {
3402         return packFloat16(aSign, 0, 0);
3403     }
3404     /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3405      * even if the input is denormal; however this is harmless because
3406      * the largest possible single-precision denormal is still smaller
3407      * than the smallest representable half-precision denormal, and so we
3408      * will end up ignoring aSig and returning via the "always return zero"
3409      * codepath.
3410      */
3411     zSig |= 0x00800000;
3412     aExp -= 0x3F1;
3413 
3414     return roundAndPackFloat16(aSign, aExp, zSig, ieee STATUS_VAR);
3415 }
3416 
3417 /*----------------------------------------------------------------------------
3418 | Returns the result of converting the double-precision floating-point value
3419 | `a' to the extended double-precision floating-point format.  The conversion
3420 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3421 | Arithmetic.
3422 *----------------------------------------------------------------------------*/
3423 
3424 floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
3425 {
3426     flag aSign;
3427     int_fast16_t aExp;
3428     uint64_t aSig;
3429 
3430     a = float64_squash_input_denormal(a STATUS_VAR);
3431     aSig = extractFloat64Frac( a );
3432     aExp = extractFloat64Exp( a );
3433     aSign = extractFloat64Sign( a );
3434     if ( aExp == 0x7FF ) {
3435         if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3436         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3437     }
3438     if ( aExp == 0 ) {
3439         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3440         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3441     }
3442     return
3443         packFloatx80(
3444             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
3445 
3446 }
3447 
3448 /*----------------------------------------------------------------------------
3449 | Returns the result of converting the double-precision floating-point value
3450 | `a' to the quadruple-precision floating-point format.  The conversion is
3451 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3452 | Arithmetic.
3453 *----------------------------------------------------------------------------*/
3454 
3455 float128 float64_to_float128( float64 a STATUS_PARAM )
3456 {
3457     flag aSign;
3458     int_fast16_t aExp;
3459     uint64_t aSig, zSig0, zSig1;
3460 
3461     a = float64_squash_input_denormal(a STATUS_VAR);
3462     aSig = extractFloat64Frac( a );
3463     aExp = extractFloat64Exp( a );
3464     aSign = extractFloat64Sign( a );
3465     if ( aExp == 0x7FF ) {
3466         if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3467         return packFloat128( aSign, 0x7FFF, 0, 0 );
3468     }
3469     if ( aExp == 0 ) {
3470         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3471         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3472         --aExp;
3473     }
3474     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
3475     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
3476 
3477 }
3478 
3479 /*----------------------------------------------------------------------------
3480 | Rounds the double-precision floating-point value `a' to an integer, and
3481 | returns the result as a double-precision floating-point value.  The
3482 | operation is performed according to the IEC/IEEE Standard for Binary
3483 | Floating-Point Arithmetic.
3484 *----------------------------------------------------------------------------*/
3485 
3486 float64 float64_round_to_int( float64 a STATUS_PARAM )
3487 {
3488     flag aSign;
3489     int_fast16_t aExp;
3490     uint64_t lastBitMask, roundBitsMask;
3491     uint64_t z;
3492     a = float64_squash_input_denormal(a STATUS_VAR);
3493 
3494     aExp = extractFloat64Exp( a );
3495     if ( 0x433 <= aExp ) {
3496         if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
3497             return propagateFloat64NaN( a, a STATUS_VAR );
3498         }
3499         return a;
3500     }
3501     if ( aExp < 0x3FF ) {
3502         if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a;
3503         STATUS(float_exception_flags) |= float_flag_inexact;
3504         aSign = extractFloat64Sign( a );
3505         switch ( STATUS(float_rounding_mode) ) {
3506          case float_round_nearest_even:
3507             if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
3508                 return packFloat64( aSign, 0x3FF, 0 );
3509             }
3510             break;
3511         case float_round_ties_away:
3512             if (aExp == 0x3FE) {
3513                 return packFloat64(aSign, 0x3ff, 0);
3514             }
3515             break;
3516          case float_round_down:
3517             return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
3518          case float_round_up:
3519             return make_float64(
3520             aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
3521         }
3522         return packFloat64( aSign, 0, 0 );
3523     }
3524     lastBitMask = 1;
3525     lastBitMask <<= 0x433 - aExp;
3526     roundBitsMask = lastBitMask - 1;
3527     z = float64_val(a);
3528     switch (STATUS(float_rounding_mode)) {
3529     case float_round_nearest_even:
3530         z += lastBitMask >> 1;
3531         if ((z & roundBitsMask) == 0) {
3532             z &= ~lastBitMask;
3533         }
3534         break;
3535     case float_round_ties_away:
3536         z += lastBitMask >> 1;
3537         break;
3538     case float_round_to_zero:
3539         break;
3540     case float_round_up:
3541         if (!extractFloat64Sign(make_float64(z))) {
3542             z += roundBitsMask;
3543         }
3544         break;
3545     case float_round_down:
3546         if (extractFloat64Sign(make_float64(z))) {
3547             z += roundBitsMask;
3548         }
3549         break;
3550     default:
3551         abort();
3552     }
3553     z &= ~ roundBitsMask;
3554     if ( z != float64_val(a) )
3555         STATUS(float_exception_flags) |= float_flag_inexact;
3556     return make_float64(z);
3557 
3558 }
3559 
3560 float64 float64_trunc_to_int( float64 a STATUS_PARAM)
3561 {
3562     int oldmode;
3563     float64 res;
3564     oldmode = STATUS(float_rounding_mode);
3565     STATUS(float_rounding_mode) = float_round_to_zero;
3566     res = float64_round_to_int(a STATUS_VAR);
3567     STATUS(float_rounding_mode) = oldmode;
3568     return res;
3569 }
3570 
3571 /*----------------------------------------------------------------------------
3572 | Returns the result of adding the absolute values of the double-precision
3573 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
3574 | before being returned.  `zSign' is ignored if the result is a NaN.
3575 | The addition is performed according to the IEC/IEEE Standard for Binary
3576 | Floating-Point Arithmetic.
3577 *----------------------------------------------------------------------------*/
3578 
3579 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3580 {
3581     int_fast16_t aExp, bExp, zExp;
3582     uint64_t aSig, bSig, zSig;
3583     int_fast16_t expDiff;
3584 
3585     aSig = extractFloat64Frac( a );
3586     aExp = extractFloat64Exp( a );
3587     bSig = extractFloat64Frac( b );
3588     bExp = extractFloat64Exp( b );
3589     expDiff = aExp - bExp;
3590     aSig <<= 9;
3591     bSig <<= 9;
3592     if ( 0 < expDiff ) {
3593         if ( aExp == 0x7FF ) {
3594             if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3595             return a;
3596         }
3597         if ( bExp == 0 ) {
3598             --expDiff;
3599         }
3600         else {
3601             bSig |= LIT64( 0x2000000000000000 );
3602         }
3603         shift64RightJamming( bSig, expDiff, &bSig );
3604         zExp = aExp;
3605     }
3606     else if ( expDiff < 0 ) {
3607         if ( bExp == 0x7FF ) {
3608             if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3609             return packFloat64( zSign, 0x7FF, 0 );
3610         }
3611         if ( aExp == 0 ) {
3612             ++expDiff;
3613         }
3614         else {
3615             aSig |= LIT64( 0x2000000000000000 );
3616         }
3617         shift64RightJamming( aSig, - expDiff, &aSig );
3618         zExp = bExp;
3619     }
3620     else {
3621         if ( aExp == 0x7FF ) {
3622             if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3623             return a;
3624         }
3625         if ( aExp == 0 ) {
3626             if (STATUS(flush_to_zero)) {
3627                 if (aSig | bSig) {
3628                     float_raise(float_flag_output_denormal STATUS_VAR);
3629                 }
3630                 return packFloat64(zSign, 0, 0);
3631             }
3632             return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
3633         }
3634         zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
3635         zExp = aExp;
3636         goto roundAndPack;
3637     }
3638     aSig |= LIT64( 0x2000000000000000 );
3639     zSig = ( aSig + bSig )<<1;
3640     --zExp;
3641     if ( (int64_t) zSig < 0 ) {
3642         zSig = aSig + bSig;
3643         ++zExp;
3644     }
3645  roundAndPack:
3646     return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3647 
3648 }
3649 
3650 /*----------------------------------------------------------------------------
3651 | Returns the result of subtracting the absolute values of the double-
3652 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
3653 | difference is negated before being returned.  `zSign' is ignored if the
3654 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
3655 | Standard for Binary Floating-Point Arithmetic.
3656 *----------------------------------------------------------------------------*/
3657 
3658 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3659 {
3660     int_fast16_t aExp, bExp, zExp;
3661     uint64_t aSig, bSig, zSig;
3662     int_fast16_t expDiff;
3663 
3664     aSig = extractFloat64Frac( a );
3665     aExp = extractFloat64Exp( a );
3666     bSig = extractFloat64Frac( b );
3667     bExp = extractFloat64Exp( b );
3668     expDiff = aExp - bExp;
3669     aSig <<= 10;
3670     bSig <<= 10;
3671     if ( 0 < expDiff ) goto aExpBigger;
3672     if ( expDiff < 0 ) goto bExpBigger;
3673     if ( aExp == 0x7FF ) {
3674         if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3675         float_raise( float_flag_invalid STATUS_VAR);
3676         return float64_default_nan;
3677     }
3678     if ( aExp == 0 ) {
3679         aExp = 1;
3680         bExp = 1;
3681     }
3682     if ( bSig < aSig ) goto aBigger;
3683     if ( aSig < bSig ) goto bBigger;
3684     return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3685  bExpBigger:
3686     if ( bExp == 0x7FF ) {
3687         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3688         return packFloat64( zSign ^ 1, 0x7FF, 0 );
3689     }
3690     if ( aExp == 0 ) {
3691         ++expDiff;
3692     }
3693     else {
3694         aSig |= LIT64( 0x4000000000000000 );
3695     }
3696     shift64RightJamming( aSig, - expDiff, &aSig );
3697     bSig |= LIT64( 0x4000000000000000 );
3698  bBigger:
3699     zSig = bSig - aSig;
3700     zExp = bExp;
3701     zSign ^= 1;
3702     goto normalizeRoundAndPack;
3703  aExpBigger:
3704     if ( aExp == 0x7FF ) {
3705         if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3706         return a;
3707     }
3708     if ( bExp == 0 ) {
3709         --expDiff;
3710     }
3711     else {
3712         bSig |= LIT64( 0x4000000000000000 );
3713     }
3714     shift64RightJamming( bSig, expDiff, &bSig );
3715     aSig |= LIT64( 0x4000000000000000 );
3716  aBigger:
3717     zSig = aSig - bSig;
3718     zExp = aExp;
3719  normalizeRoundAndPack:
3720     --zExp;
3721     return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3722 
3723 }
3724 
3725 /*----------------------------------------------------------------------------
3726 | Returns the result of adding the double-precision floating-point values `a'
3727 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
3728 | Binary Floating-Point Arithmetic.
3729 *----------------------------------------------------------------------------*/
3730 
3731 float64 float64_add( float64 a, float64 b STATUS_PARAM )
3732 {
3733     flag aSign, bSign;
3734     a = float64_squash_input_denormal(a STATUS_VAR);
3735     b = float64_squash_input_denormal(b STATUS_VAR);
3736 
3737     aSign = extractFloat64Sign( a );
3738     bSign = extractFloat64Sign( b );
3739     if ( aSign == bSign ) {
3740         return addFloat64Sigs( a, b, aSign STATUS_VAR );
3741     }
3742     else {
3743         return subFloat64Sigs( a, b, aSign STATUS_VAR );
3744     }
3745 
3746 }
3747 
3748 /*----------------------------------------------------------------------------
3749 | Returns the result of subtracting the double-precision floating-point values
3750 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3751 | for Binary Floating-Point Arithmetic.
3752 *----------------------------------------------------------------------------*/
3753 
3754 float64 float64_sub( float64 a, float64 b STATUS_PARAM )
3755 {
3756     flag aSign, bSign;
3757     a = float64_squash_input_denormal(a STATUS_VAR);
3758     b = float64_squash_input_denormal(b STATUS_VAR);
3759 
3760     aSign = extractFloat64Sign( a );
3761     bSign = extractFloat64Sign( b );
3762     if ( aSign == bSign ) {
3763         return subFloat64Sigs( a, b, aSign STATUS_VAR );
3764     }
3765     else {
3766         return addFloat64Sigs( a, b, aSign STATUS_VAR );
3767     }
3768 
3769 }
3770 
3771 /*----------------------------------------------------------------------------
3772 | Returns the result of multiplying the double-precision floating-point values
3773 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3774 | for Binary Floating-Point Arithmetic.
3775 *----------------------------------------------------------------------------*/
3776 
3777 float64 float64_mul( float64 a, float64 b STATUS_PARAM )
3778 {
3779     flag aSign, bSign, zSign;
3780     int_fast16_t aExp, bExp, zExp;
3781     uint64_t aSig, bSig, zSig0, zSig1;
3782 
3783     a = float64_squash_input_denormal(a STATUS_VAR);
3784     b = float64_squash_input_denormal(b STATUS_VAR);
3785 
3786     aSig = extractFloat64Frac( a );
3787     aExp = extractFloat64Exp( a );
3788     aSign = extractFloat64Sign( a );
3789     bSig = extractFloat64Frac( b );
3790     bExp = extractFloat64Exp( b );
3791     bSign = extractFloat64Sign( b );
3792     zSign = aSign ^ bSign;
3793     if ( aExp == 0x7FF ) {
3794         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3795             return propagateFloat64NaN( a, b STATUS_VAR );
3796         }
3797         if ( ( bExp | bSig ) == 0 ) {
3798             float_raise( float_flag_invalid STATUS_VAR);
3799             return float64_default_nan;
3800         }
3801         return packFloat64( zSign, 0x7FF, 0 );
3802     }
3803     if ( bExp == 0x7FF ) {
3804         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3805         if ( ( aExp | aSig ) == 0 ) {
3806             float_raise( float_flag_invalid STATUS_VAR);
3807             return float64_default_nan;
3808         }
3809         return packFloat64( zSign, 0x7FF, 0 );
3810     }
3811     if ( aExp == 0 ) {
3812         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3813         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3814     }
3815     if ( bExp == 0 ) {
3816         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
3817         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3818     }
3819     zExp = aExp + bExp - 0x3FF;
3820     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3821     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3822     mul64To128( aSig, bSig, &zSig0, &zSig1 );
3823     zSig0 |= ( zSig1 != 0 );
3824     if ( 0 <= (int64_t) ( zSig0<<1 ) ) {
3825         zSig0 <<= 1;
3826         --zExp;
3827     }
3828     return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
3829 
3830 }
3831 
3832 /*----------------------------------------------------------------------------
3833 | Returns the result of dividing the double-precision floating-point value `a'
3834 | by the corresponding value `b'.  The operation is performed according to
3835 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3836 *----------------------------------------------------------------------------*/
3837 
3838 float64 float64_div( float64 a, float64 b STATUS_PARAM )
3839 {
3840     flag aSign, bSign, zSign;
3841     int_fast16_t aExp, bExp, zExp;
3842     uint64_t aSig, bSig, zSig;
3843     uint64_t rem0, rem1;
3844     uint64_t term0, term1;
3845     a = float64_squash_input_denormal(a STATUS_VAR);
3846     b = float64_squash_input_denormal(b STATUS_VAR);
3847 
3848     aSig = extractFloat64Frac( a );
3849     aExp = extractFloat64Exp( a );
3850     aSign = extractFloat64Sign( a );
3851     bSig = extractFloat64Frac( b );
3852     bExp = extractFloat64Exp( b );
3853     bSign = extractFloat64Sign( b );
3854     zSign = aSign ^ bSign;
3855     if ( aExp == 0x7FF ) {
3856         if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3857         if ( bExp == 0x7FF ) {
3858             if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3859             float_raise( float_flag_invalid STATUS_VAR);
3860             return float64_default_nan;
3861         }
3862         return packFloat64( zSign, 0x7FF, 0 );
3863     }
3864     if ( bExp == 0x7FF ) {
3865         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3866         return packFloat64( zSign, 0, 0 );
3867     }
3868     if ( bExp == 0 ) {
3869         if ( bSig == 0 ) {
3870             if ( ( aExp | aSig ) == 0 ) {
3871                 float_raise( float_flag_invalid STATUS_VAR);
3872                 return float64_default_nan;
3873             }
3874             float_raise( float_flag_divbyzero STATUS_VAR);
3875             return packFloat64( zSign, 0x7FF, 0 );
3876         }
3877         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3878     }
3879     if ( aExp == 0 ) {
3880         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3881         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3882     }
3883     zExp = aExp - bExp + 0x3FD;
3884     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3885     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3886     if ( bSig <= ( aSig + aSig ) ) {
3887         aSig >>= 1;
3888         ++zExp;
3889     }
3890     zSig = estimateDiv128To64( aSig, 0, bSig );
3891     if ( ( zSig & 0x1FF ) <= 2 ) {
3892         mul64To128( bSig, zSig, &term0, &term1 );
3893         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3894         while ( (int64_t) rem0 < 0 ) {
3895             --zSig;
3896             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3897         }
3898         zSig |= ( rem1 != 0 );
3899     }
3900     return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3901 
3902 }
3903 
3904 /*----------------------------------------------------------------------------
3905 | Returns the remainder of the double-precision floating-point value `a'
3906 | with respect to the corresponding value `b'.  The operation is performed
3907 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3908 *----------------------------------------------------------------------------*/
3909 
/* Special cases handled before the main computation, as coded below:
 *   - a NaN operand is propagated;
 *   - an infinite `a' or a zero `b' raises invalid and returns the
 *     default NaN;
 *   - an infinite `b' or a zero `a' returns `a' unchanged.
 * The sign of the result is aSign, flipped when the selected remainder
 * is negative.
 */
3910 float64 float64_rem( float64 a, float64 b STATUS_PARAM )
3911 {
3912     flag aSign, zSign;
3913     int_fast16_t aExp, bExp, expDiff;
3914     uint64_t aSig, bSig;
3915     uint64_t q, alternateASig;
3916     int64_t sigMean;
3917 
3918     a = float64_squash_input_denormal(a STATUS_VAR);
3919     b = float64_squash_input_denormal(b STATUS_VAR);
3920     aSig = extractFloat64Frac( a );
3921     aExp = extractFloat64Exp( a );
3922     aSign = extractFloat64Sign( a );
3923     bSig = extractFloat64Frac( b );
3924     bExp = extractFloat64Exp( b );
          /* `a' is Inf or NaN */
3925     if ( aExp == 0x7FF ) {
3926         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3927             return propagateFloat64NaN( a, b STATUS_VAR );
3928         }
3929         float_raise( float_flag_invalid STATUS_VAR);
3930         return float64_default_nan;
3931     }
          /* `b' is Inf or NaN while `a' is finite */
3932     if ( bExp == 0x7FF ) {
3933         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3934         return a;
3935     }
3936     if ( bExp == 0 ) {
3937         if ( bSig == 0 ) {
                  /* remainder with respect to zero is invalid */
3938             float_raise( float_flag_invalid STATUS_VAR);
3939             return float64_default_nan;
3940         }
3941         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3942     }
3943     if ( aExp == 0 ) {
3944         if ( aSig == 0 ) return a;
3945         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3946     }
3947     expDiff = aExp - bExp;
          /* make the hidden bit explicit and move it to bit 63 */
3948     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3949     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3950     if ( expDiff < 0 ) {
              /* exponent of `a' more than one below `b': `a' is returned as is */
3951         if ( expDiff < -1 ) return a;
3952         aSig >>= 1;
3953     }
          /* first quotient bit: does bSig fit into aSig once? */
3954     q = ( bSig <= aSig );
3955     if ( q ) aSig -= bSig;
          /* Each pass of the loop consumes 62 of the remaining exponent
           * difference.  The estimate is lowered by 2 (floored at 0).
           * NOTE(review): presumably so it never exceeds the true quotient;
           * confirm against estimateDiv128To64.  The new aSig is computed
           * in modular 64-bit unsigned arithmetic.
           */
3956     expDiff -= 64;
3957     while ( 0 < expDiff ) {
3958         q = estimateDiv128To64( aSig, 0, bSig );
3959         q = ( 2 < q ) ? q - 2 : 0;
3960         aSig = - ( ( bSig>>2 ) * q );
3961         expDiff -= 62;
3962     }
3963     expDiff += 64;
3964     if ( 0 < expDiff ) {
              /* final partial step: keep only the top expDiff quotient bits */
3965         q = estimateDiv128To64( aSig, 0, bSig );
3966         q = ( 2 < q ) ? q - 2 : 0;
3967         q >>= 64 - expDiff;
3968         bSig >>= 2;
3969         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3970     }
3971     else {
3972         aSig >>= 2;
3973         bSig >>= 2;
3974     }
          /* Subtract bSig until the remainder goes negative; alternateASig
           * holds the last non-negative remainder and q counts the steps.
           */
3975     do {
3976         alternateASig = aSig;
3977         ++q;
3978         aSig -= bSig;
3979     } while ( 0 <= (int64_t) aSig );
          /* Choose between the negative and the last non-negative remainder
           * by the sign of their sum; on an exact tie (sum == 0) the parity
           * of q decides.
           */
3980     sigMean = aSig + alternateASig;
3981     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3982         aSig = alternateASig;
3983     }
          /* a negative remainder is stored as magnitude plus a sign flip */
3984     zSign = ( (int64_t) aSig < 0 );
3985     if ( zSign ) aSig = - aSig;
3986     return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
3987 
3988 }
3989 
3990 /*----------------------------------------------------------------------------
3991 | Returns the result of multiplying the double-precision floating-point values
3992 | `a' and `b' then adding 'c', with no intermediate rounding step after the
3993 | multiplication.  The operation is performed according to the IEC/IEEE
3994 | Standard for Binary Floating-Point Arithmetic 754-2008.
3995 | The flags argument allows the caller to select negation of the
3996 | addend, the intermediate product, or the final result. (The difference
3997 | between this and having the caller do a separate negation is that negating
3998 | externally will flip the sign bit on NaNs.)
3999 *----------------------------------------------------------------------------*/
4000 
4001 float64 float64_muladd(float64 a, float64 b, float64 c, int flags STATUS_PARAM)
4002 {
4003     flag aSign, bSign, cSign, zSign;
4004     int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
4005     uint64_t aSig, bSig, cSig;
4006     flag pInf, pZero, pSign;
4007     uint64_t pSig0, pSig1, cSig0, cSig1, zSig0, zSig1;
4008     int shiftcount;
4009     flag signflip, infzero;
4010 
4011     a = float64_squash_input_denormal(a STATUS_VAR);
4012     b = float64_squash_input_denormal(b STATUS_VAR);
4013     c = float64_squash_input_denormal(c STATUS_VAR);
4014     aSig = extractFloat64Frac(a);
4015     aExp = extractFloat64Exp(a);
4016     aSign = extractFloat64Sign(a);
4017     bSig = extractFloat64Frac(b);
4018     bExp = extractFloat64Exp(b);
4019     bSign = extractFloat64Sign(b);
4020     cSig = extractFloat64Frac(c);
4021     cExp = extractFloat64Exp(c);
4022     cSign = extractFloat64Sign(c);
4023 
4024     infzero = ((aExp == 0 && aSig == 0 && bExp == 0x7ff && bSig == 0) ||
4025                (aExp == 0x7ff && aSig == 0 && bExp == 0 && bSig == 0));
4026 
4027     /* It is implementation-defined whether the cases of (0,inf,qnan)
4028      * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
4029      * they return if they do), so we have to hand this information
4030      * off to the target-specific pick-a-NaN routine.
4031      */
4032     if (((aExp == 0x7ff) && aSig) ||
4033         ((bExp == 0x7ff) && bSig) ||
4034         ((cExp == 0x7ff) && cSig)) {
4035         return propagateFloat64MulAddNaN(a, b, c, infzero STATUS_VAR);
4036     }
4037 
4038     if (infzero) {
4039         float_raise(float_flag_invalid STATUS_VAR);
4040         return float64_default_nan;
4041     }
4042 
4043     if (flags & float_muladd_negate_c) {
4044         cSign ^= 1;
4045     }
4046 
4047     signflip = (flags & float_muladd_negate_result) ? 1 : 0;
4048 
4049     /* Work out the sign and type of the product */
4050     pSign = aSign ^ bSign;
4051     if (flags & float_muladd_negate_product) {
4052         pSign ^= 1;
4053     }
4054     pInf = (aExp == 0x7ff) || (bExp == 0x7ff);
4055     pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
4056 
4057     if (cExp == 0x7ff) {
4058         if (pInf && (pSign ^ cSign)) {
4059             /* addition of opposite-signed infinities => InvalidOperation */
4060             float_raise(float_flag_invalid STATUS_VAR);
4061             return float64_default_nan;
4062         }
4063         /* Otherwise generate an infinity of the same sign */
4064         return packFloat64(cSign ^ signflip, 0x7ff, 0);
4065     }
4066 
4067     if (pInf) {
4068         return packFloat64(pSign ^ signflip, 0x7ff, 0);
4069     }
4070 
4071     if (pZero) {
4072         if (cExp == 0) {
4073             if (cSig == 0) {
4074                 /* Adding two exact zeroes */
4075                 if (pSign == cSign) {
4076                     zSign = pSign;
4077                 } else if (STATUS(float_rounding_mode) == float_round_down) {
4078                     zSign = 1;
4079                 } else {
4080                     zSign = 0;
4081                 }
4082                 return packFloat64(zSign ^ signflip, 0, 0);
4083             }
4084             /* Exact zero plus a denorm */
4085             if (STATUS(flush_to_zero)) {
4086                 float_raise(float_flag_output_denormal STATUS_VAR);
4087                 return packFloat64(cSign ^ signflip, 0, 0);
4088             }
4089         }
4090         /* Zero plus something non-zero : just return the something */
4091         return packFloat64(cSign ^ signflip, cExp, cSig);
4092     }
4093 
4094     if (aExp == 0) {
4095         normalizeFloat64Subnormal(aSig, &aExp, &aSig);
4096     }
4097     if (bExp == 0) {
4098         normalizeFloat64Subnormal(bSig, &bExp, &bSig);
4099     }
4100 
4101     /* Calculate the actual result a * b + c */
4102 
4103     /* Multiply first; this is easy. */
4104     /* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff
4105      * because we want the true exponent, not the "one-less-than"
4106      * flavour that roundAndPackFloat64() takes.
4107      */
4108     pExp = aExp + bExp - 0x3fe;
4109     aSig = (aSig | LIT64(0x0010000000000000))<<10;
4110     bSig = (bSig | LIT64(0x0010000000000000))<<11;
4111     mul64To128(aSig, bSig, &pSig0, &pSig1);
4112     if ((int64_t)(pSig0 << 1) >= 0) {
4113         shortShift128Left(pSig0, pSig1, 1, &pSig0, &pSig1);
4114         pExp--;
4115     }
4116 
4117     zSign = pSign ^ signflip;
4118 
4119     /* Now [pSig0:pSig1] is the significand of the multiply, with the explicit
4120      * bit in position 126.
4121      */
4122     if (cExp == 0) {
4123         if (!cSig) {
4124             /* Throw out the special case of c being an exact zero now */
4125             shift128RightJamming(pSig0, pSig1, 64, &pSig0, &pSig1);
4126             return roundAndPackFloat64(zSign, pExp - 1,
4127                                        pSig1 STATUS_VAR);
4128         }
4129         normalizeFloat64Subnormal(cSig, &cExp, &cSig);
4130     }
4131 
4132     /* Shift cSig and add the explicit bit so [cSig0:cSig1] is the
4133      * significand of the addend, with the explicit bit in position 126.
4134      */
4135     cSig0 = cSig << (126 - 64 - 52);
4136     cSig1 = 0;
4137     cSig0 |= LIT64(0x4000000000000000);
4138     expDiff = pExp - cExp;
4139 
4140     if (pSign == cSign) {
4141         /* Addition */
4142         if (expDiff > 0) {
4143             /* scale c to match p */
4144             shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
4145             zExp = pExp;
4146         } else if (expDiff < 0) {
4147             /* scale p to match c */
4148             shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
4149             zExp = cExp;
4150         } else {
4151             /* no scaling needed */
4152             zExp = cExp;
4153         }
4154         /* Add significands and make sure explicit bit ends up in posn 126 */
4155         add128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4156         if ((int64_t)zSig0 < 0) {
4157             shift128RightJamming(zSig0, zSig1, 1, &zSig0, &zSig1);
4158         } else {
4159             zExp--;
4160         }
4161         shift128RightJamming(zSig0, zSig1, 64, &zSig0, &zSig1);
4162         return roundAndPackFloat64(zSign, zExp, zSig1 STATUS_VAR);
4163     } else {
4164         /* Subtraction */
4165         if (expDiff > 0) {
4166             shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
4167             sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4168             zExp = pExp;
4169         } else if (expDiff < 0) {
4170             shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
4171             sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
4172             zExp = cExp;
4173             zSign ^= 1;
4174         } else {
4175             zExp = pExp;
4176             if (lt128(cSig0, cSig1, pSig0, pSig1)) {
4177                 sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4178             } else if (lt128(pSig0, pSig1, cSig0, cSig1)) {
4179                 sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
4180                 zSign ^= 1;
4181             } else {
4182                 /* Exact zero */
4183                 zSign = signflip;
4184                 if (STATUS(float_rounding_mode) == float_round_down) {
4185                     zSign ^= 1;
4186                 }
4187                 return packFloat64(zSign, 0, 0);
4188             }
4189         }
4190         --zExp;
4191         /* Do the equivalent of normalizeRoundAndPackFloat64() but
4192          * starting with the significand in a pair of uint64_t.
4193          */
4194         if (zSig0) {
4195             shiftcount = countLeadingZeros64(zSig0) - 1;
4196             shortShift128Left(zSig0, zSig1, shiftcount, &zSig0, &zSig1);
4197             if (zSig1) {
4198                 zSig0 |= 1;
4199             }
4200             zExp -= shiftcount;
4201         } else {
4202             shiftcount = countLeadingZeros64(zSig1);
4203             if (shiftcount == 0) {
4204                 zSig0 = (zSig1 >> 1) | (zSig1 & 1);
4205                 zExp -= 63;
4206             } else {
4207                 shiftcount--;
4208                 zSig0 = zSig1 << shiftcount;
4209                 zExp -= (shiftcount + 64);
4210             }
4211         }
4212         return roundAndPackFloat64(zSign, zExp, zSig0 STATUS_VAR);
4213     }
4214 }
4215 
4216 /*----------------------------------------------------------------------------
4217 | Returns the square root of the double-precision floating-point value `a'.
4218 | The operation is performed according to the IEC/IEEE Standard for Binary
4219 | Floating-Point Arithmetic.
4220 *----------------------------------------------------------------------------*/
4221 
4222 float64 float64_sqrt( float64 a STATUS_PARAM )
4223 {
4224     flag aSign;
4225     int_fast16_t aExp, zExp;
4226     uint64_t aSig, zSig, doubleZSig;
4227     uint64_t rem0, rem1, term0, term1;
4228     a = float64_squash_input_denormal(a STATUS_VAR);
4229 
4230     aSig = extractFloat64Frac( a );
4231     aExp = extractFloat64Exp( a );
4232     aSign = extractFloat64Sign( a );
4233     if ( aExp == 0x7FF ) {
4234         if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
4235         if ( ! aSign ) return a;
4236         float_raise( float_flag_invalid STATUS_VAR);
4237         return float64_default_nan;
4238     }
4239     if ( aSign ) {
4240         if ( ( aExp | aSig ) == 0 ) return a;
4241         float_raise( float_flag_invalid STATUS_VAR);
4242         return float64_default_nan;
4243     }
4244     if ( aExp == 0 ) {
4245         if ( aSig == 0 ) return float64_zero;
4246         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4247     }
4248     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
4249     aSig |= LIT64( 0x0010000000000000 );
4250     zSig = estimateSqrt32( aExp, aSig>>21 );
4251     aSig <<= 9 - ( aExp & 1 );
4252     zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
4253     if ( ( zSig & 0x1FF ) <= 5 ) {
4254         doubleZSig = zSig<<1;
4255         mul64To128( zSig, zSig, &term0, &term1 );
4256         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
4257         while ( (int64_t) rem0 < 0 ) {
4258             --zSig;
4259             doubleZSig -= 2;
4260             add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
4261         }
4262         zSig |= ( ( rem0 | rem1 ) != 0 );
4263     }
4264     return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
4265 
4266 }
4267 
4268 /*----------------------------------------------------------------------------
4269 | Returns the binary log of the double-precision floating-point value `a'.
4270 | The operation is performed according to the IEC/IEEE Standard for Binary
4271 | Floating-Point Arithmetic.
4272 *----------------------------------------------------------------------------*/
4273 float64 float64_log2( float64 a STATUS_PARAM )
4274 {
4275     flag aSign, zSign;
4276     int_fast16_t aExp;
4277     uint64_t aSig, aSig0, aSig1, zSig, i;
4278     a = float64_squash_input_denormal(a STATUS_VAR);
4279 
4280     aSig = extractFloat64Frac( a );
4281     aExp = extractFloat64Exp( a );
4282     aSign = extractFloat64Sign( a );
4283 
4284     if ( aExp == 0 ) {
4285         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4286         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4287     }
4288     if ( aSign ) {
4289         float_raise( float_flag_invalid STATUS_VAR);
4290         return float64_default_nan;
4291     }
4292     if ( aExp == 0x7FF ) {
4293         if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR );
4294         return a;
4295     }
4296 
4297     aExp -= 0x3FF;
4298     aSig |= LIT64( 0x0010000000000000 );
4299     zSign = aExp < 0;
4300     zSig = (uint64_t)aExp << 52;
4301     for (i = 1LL << 51; i > 0; i >>= 1) {
4302         mul64To128( aSig, aSig, &aSig0, &aSig1 );
4303         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4304         if ( aSig & LIT64( 0x0020000000000000 ) ) {
4305             aSig >>= 1;
4306             zSig |= i;
4307         }
4308     }
4309 
4310     if ( zSign )
4311         zSig = -zSig;
4312     return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR );
4313 }
4314 
4315 /*----------------------------------------------------------------------------
4316 | Returns 1 if the double-precision floating-point value `a' is equal to the
4317 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
4318 | if either operand is a NaN.  Otherwise, the comparison is performed
4319 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4320 *----------------------------------------------------------------------------*/
4321 
4322 int float64_eq( float64 a, float64 b STATUS_PARAM )
4323 {
4324     uint64_t av, bv;
4325     a = float64_squash_input_denormal(a STATUS_VAR);
4326     b = float64_squash_input_denormal(b STATUS_VAR);
4327 
4328     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4329          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4330        ) {
4331         float_raise( float_flag_invalid STATUS_VAR);
4332         return 0;
4333     }
4334     av = float64_val(a);
4335     bv = float64_val(b);
4336     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4337 
4338 }
4339 
4340 /*----------------------------------------------------------------------------
4341 | Returns 1 if the double-precision floating-point value `a' is less than or
4342 | equal to the corresponding value `b', and 0 otherwise.  The invalid
4343 | exception is raised if either operand is a NaN.  The comparison is performed
4344 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4345 *----------------------------------------------------------------------------*/
4346 
4347 int float64_le( float64 a, float64 b STATUS_PARAM )
4348 {
4349     flag aSign, bSign;
4350     uint64_t av, bv;
4351     a = float64_squash_input_denormal(a STATUS_VAR);
4352     b = float64_squash_input_denormal(b STATUS_VAR);
4353 
4354     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4355          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4356        ) {
4357         float_raise( float_flag_invalid STATUS_VAR);
4358         return 0;
4359     }
4360     aSign = extractFloat64Sign( a );
4361     bSign = extractFloat64Sign( b );
4362     av = float64_val(a);
4363     bv = float64_val(b);
4364     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4365     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4366 
4367 }
4368 
4369 /*----------------------------------------------------------------------------
4370 | Returns 1 if the double-precision floating-point value `a' is less than
4371 | the corresponding value `b', and 0 otherwise.  The invalid exception is
4372 | raised if either operand is a NaN.  The comparison is performed according
4373 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4374 *----------------------------------------------------------------------------*/
4375 
4376 int float64_lt( float64 a, float64 b STATUS_PARAM )
4377 {
4378     flag aSign, bSign;
4379     uint64_t av, bv;
4380 
4381     a = float64_squash_input_denormal(a STATUS_VAR);
4382     b = float64_squash_input_denormal(b STATUS_VAR);
4383     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4384          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4385        ) {
4386         float_raise( float_flag_invalid STATUS_VAR);
4387         return 0;
4388     }
4389     aSign = extractFloat64Sign( a );
4390     bSign = extractFloat64Sign( b );
4391     av = float64_val(a);
4392     bv = float64_val(b);
4393     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4394     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4395 
4396 }
4397 
4398 /*----------------------------------------------------------------------------
4399 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4400 | be compared, and 0 otherwise.  The invalid exception is raised if either
4401 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
4402 | Standard for Binary Floating-Point Arithmetic.
4403 *----------------------------------------------------------------------------*/
4404 
4405 int float64_unordered( float64 a, float64 b STATUS_PARAM )
4406 {
4407     a = float64_squash_input_denormal(a STATUS_VAR);
4408     b = float64_squash_input_denormal(b STATUS_VAR);
4409 
4410     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4411          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4412        ) {
4413         float_raise( float_flag_invalid STATUS_VAR);
4414         return 1;
4415     }
4416     return 0;
4417 }
4418 
4419 /*----------------------------------------------------------------------------
4420 | Returns 1 if the double-precision floating-point value `a' is equal to the
4421 | corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4422 | exception.  The comparison is performed according to the IEC/IEEE Standard
4423 | for Binary Floating-Point Arithmetic.
4424 *----------------------------------------------------------------------------*/
4425 
4426 int float64_eq_quiet( float64 a, float64 b STATUS_PARAM )
4427 {
4428     uint64_t av, bv;
4429     a = float64_squash_input_denormal(a STATUS_VAR);
4430     b = float64_squash_input_denormal(b STATUS_VAR);
4431 
4432     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4433          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4434        ) {
4435         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4436             float_raise( float_flag_invalid STATUS_VAR);
4437         }
4438         return 0;
4439     }
4440     av = float64_val(a);
4441     bv = float64_val(b);
4442     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4443 
4444 }
4445 
4446 /*----------------------------------------------------------------------------
4447 | Returns 1 if the double-precision floating-point value `a' is less than or
4448 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4449 | cause an exception.  Otherwise, the comparison is performed according to the
4450 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4451 *----------------------------------------------------------------------------*/
4452 
4453 int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
4454 {
4455     flag aSign, bSign;
4456     uint64_t av, bv;
4457     a = float64_squash_input_denormal(a STATUS_VAR);
4458     b = float64_squash_input_denormal(b STATUS_VAR);
4459 
4460     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4461          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4462        ) {
4463         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4464             float_raise( float_flag_invalid STATUS_VAR);
4465         }
4466         return 0;
4467     }
4468     aSign = extractFloat64Sign( a );
4469     bSign = extractFloat64Sign( b );
4470     av = float64_val(a);
4471     bv = float64_val(b);
4472     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4473     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4474 
4475 }
4476 
4477 /*----------------------------------------------------------------------------
4478 | Returns 1 if the double-precision floating-point value `a' is less than
4479 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4480 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4481 | Standard for Binary Floating-Point Arithmetic.
4482 *----------------------------------------------------------------------------*/
4483 
4484 int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
4485 {
4486     flag aSign, bSign;
4487     uint64_t av, bv;
4488     a = float64_squash_input_denormal(a STATUS_VAR);
4489     b = float64_squash_input_denormal(b STATUS_VAR);
4490 
4491     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4492          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4493        ) {
4494         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4495             float_raise( float_flag_invalid STATUS_VAR);
4496         }
4497         return 0;
4498     }
4499     aSign = extractFloat64Sign( a );
4500     bSign = extractFloat64Sign( b );
4501     av = float64_val(a);
4502     bv = float64_val(b);
4503     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4504     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4505 
4506 }
4507 
4508 /*----------------------------------------------------------------------------
4509 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4510 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
4511 | comparison is performed according to the IEC/IEEE Standard for Binary
4512 | Floating-Point Arithmetic.
4513 *----------------------------------------------------------------------------*/
4514 
4515 int float64_unordered_quiet( float64 a, float64 b STATUS_PARAM )
4516 {
4517     a = float64_squash_input_denormal(a STATUS_VAR);
4518     b = float64_squash_input_denormal(b STATUS_VAR);
4519 
4520     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4521          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4522        ) {
4523         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4524             float_raise( float_flag_invalid STATUS_VAR);
4525         }
4526         return 1;
4527     }
4528     return 0;
4529 }
4530 
4531 /*----------------------------------------------------------------------------
4532 | Returns the result of converting the extended double-precision floating-
4533 | point value `a' to the 32-bit two's complement integer format.  The
4534 | conversion is performed according to the IEC/IEEE Standard for Binary
4535 | Floating-Point Arithmetic---which means in particular that the conversion
4536 | is rounded according to the current rounding mode.  If `a' is a NaN, the
4537 | largest positive integer is returned.  Otherwise, if the conversion
4538 | overflows, the largest integer with the same sign as `a' is returned.
4539 *----------------------------------------------------------------------------*/
4540 
4541 int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
4542 {
4543     flag aSign;
4544     int32 aExp, shiftCount;
4545     uint64_t aSig;
4546 
4547     aSig = extractFloatx80Frac( a );
4548     aExp = extractFloatx80Exp( a );
4549     aSign = extractFloatx80Sign( a );
4550     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4551     shiftCount = 0x4037 - aExp;
4552     if ( shiftCount <= 0 ) shiftCount = 1;
4553     shift64RightJamming( aSig, shiftCount, &aSig );
4554     return roundAndPackInt32( aSign, aSig STATUS_VAR );
4555 
4556 }
4557 
4558 /*----------------------------------------------------------------------------
4559 | Returns the result of converting the extended double-precision floating-
4560 | point value `a' to the 32-bit two's complement integer format.  The
4561 | conversion is performed according to the IEC/IEEE Standard for Binary
4562 | Floating-Point Arithmetic, except that the conversion is always rounded
4563 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4564 | Otherwise, if the conversion overflows, the largest integer with the same
4565 | sign as `a' is returned.
4566 *----------------------------------------------------------------------------*/
4567 
4568 int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
4569 {
4570     flag aSign;
4571     int32 aExp, shiftCount;
4572     uint64_t aSig, savedASig;
4573     int32_t z;
4574 
4575     aSig = extractFloatx80Frac( a );
4576     aExp = extractFloatx80Exp( a );
4577     aSign = extractFloatx80Sign( a );
4578     if ( 0x401E < aExp ) {
4579         if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4580         goto invalid;
4581     }
4582     else if ( aExp < 0x3FFF ) {
4583         if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
4584         return 0;
4585     }
4586     shiftCount = 0x403E - aExp;
4587     savedASig = aSig;
4588     aSig >>= shiftCount;
4589     z = aSig;
4590     if ( aSign ) z = - z;
4591     if ( ( z < 0 ) ^ aSign ) {
4592  invalid:
4593         float_raise( float_flag_invalid STATUS_VAR);
4594         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4595     }
4596     if ( ( aSig<<shiftCount ) != savedASig ) {
4597         STATUS(float_exception_flags) |= float_flag_inexact;
4598     }
4599     return z;
4600 
4601 }
4602 
4603 /*----------------------------------------------------------------------------
4604 | Returns the result of converting the extended double-precision floating-
4605 | point value `a' to the 64-bit two's complement integer format.  The
4606 | conversion is performed according to the IEC/IEEE Standard for Binary
4607 | Floating-Point Arithmetic---which means in particular that the conversion
4608 | is rounded according to the current rounding mode.  If `a' is a NaN,
4609 | the largest positive integer is returned.  Otherwise, if the conversion
4610 | overflows, the largest integer with the same sign as `a' is returned.
4611 *----------------------------------------------------------------------------*/
4612 
4613 int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
4614 {
4615     flag aSign;
4616     int32 aExp, shiftCount;
4617     uint64_t aSig, aSigExtra;
4618 
4619     aSig = extractFloatx80Frac( a );
4620     aExp = extractFloatx80Exp( a );
4621     aSign = extractFloatx80Sign( a );
4622     shiftCount = 0x403E - aExp;
4623     if ( shiftCount <= 0 ) {
4624         if ( shiftCount ) {
4625             float_raise( float_flag_invalid STATUS_VAR);
4626             if (    ! aSign
4627                  || (    ( aExp == 0x7FFF )
4628                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
4629                ) {
4630                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4631             }
4632             return (int64_t) LIT64( 0x8000000000000000 );
4633         }
4634         aSigExtra = 0;
4635     }
4636     else {
4637         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4638     }
4639     return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
4640 
4641 }
4642 
4643 /*----------------------------------------------------------------------------
4644 | Returns the result of converting the extended double-precision floating-
4645 | point value `a' to the 64-bit two's complement integer format.  The
4646 | conversion is performed according to the IEC/IEEE Standard for Binary
4647 | Floating-Point Arithmetic, except that the conversion is always rounded
4648 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4649 | Otherwise, if the conversion overflows, the largest integer with the same
4650 | sign as `a' is returned.
4651 *----------------------------------------------------------------------------*/
4652 
4653 int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
4654 {
4655     flag aSign;
4656     int32 aExp, shiftCount;
4657     uint64_t aSig;
4658     int64 z;
4659 
4660     aSig = extractFloatx80Frac( a );
4661     aExp = extractFloatx80Exp( a );
4662     aSign = extractFloatx80Sign( a );
4663     shiftCount = aExp - 0x403E;
4664     if ( 0 <= shiftCount ) {
4665         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4666         if ( ( a.high != 0xC03E ) || aSig ) {
4667             float_raise( float_flag_invalid STATUS_VAR);
4668             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4669                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4670             }
4671         }
4672         return (int64_t) LIT64( 0x8000000000000000 );
4673     }
4674     else if ( aExp < 0x3FFF ) {
4675         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
4676         return 0;
4677     }
4678     z = aSig>>( - shiftCount );
4679     if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
4680         STATUS(float_exception_flags) |= float_flag_inexact;
4681     }
4682     if ( aSign ) z = - z;
4683     return z;
4684 
4685 }
4686 
4687 /*----------------------------------------------------------------------------
4688 | Returns the result of converting the extended double-precision floating-
4689 | point value `a' to the single-precision floating-point format.  The
4690 | conversion is performed according to the IEC/IEEE Standard for Binary
4691 | Floating-Point Arithmetic.
4692 *----------------------------------------------------------------------------*/
4693 
4694 float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
4695 {
4696     flag aSign;
4697     int32 aExp;
4698     uint64_t aSig;
4699 
4700     aSig = extractFloatx80Frac( a );
4701     aExp = extractFloatx80Exp( a );
4702     aSign = extractFloatx80Sign( a );
4703     if ( aExp == 0x7FFF ) {
4704         if ( (uint64_t) ( aSig<<1 ) ) {
4705             return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4706         }
4707         return packFloat32( aSign, 0xFF, 0 );
4708     }
4709     shift64RightJamming( aSig, 33, &aSig );
4710     if ( aExp || aSig ) aExp -= 0x3F81;
4711     return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
4712 
4713 }
4714 
4715 /*----------------------------------------------------------------------------
4716 | Returns the result of converting the extended double-precision floating-
4717 | point value `a' to the double-precision floating-point format.  The
4718 | conversion is performed according to the IEC/IEEE Standard for Binary
4719 | Floating-Point Arithmetic.
4720 *----------------------------------------------------------------------------*/
4721 
4722 float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
4723 {
4724     flag aSign;
4725     int32 aExp;
4726     uint64_t aSig, zSig;
4727 
4728     aSig = extractFloatx80Frac( a );
4729     aExp = extractFloatx80Exp( a );
4730     aSign = extractFloatx80Sign( a );
4731     if ( aExp == 0x7FFF ) {
4732         if ( (uint64_t) ( aSig<<1 ) ) {
4733             return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4734         }
4735         return packFloat64( aSign, 0x7FF, 0 );
4736     }
4737     shift64RightJamming( aSig, 1, &zSig );
4738     if ( aExp || aSig ) aExp -= 0x3C01;
4739     return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
4740 
4741 }
4742 
4743 /*----------------------------------------------------------------------------
4744 | Returns the result of converting the extended double-precision floating-
4745 | point value `a' to the quadruple-precision floating-point format.  The
4746 | conversion is performed according to the IEC/IEEE Standard for Binary
4747 | Floating-Point Arithmetic.
4748 *----------------------------------------------------------------------------*/
4749 
4750 float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
4751 {
4752     flag aSign;
4753     int_fast16_t aExp;
4754     uint64_t aSig, zSig0, zSig1;
4755 
4756     aSig = extractFloatx80Frac( a );
4757     aExp = extractFloatx80Exp( a );
4758     aSign = extractFloatx80Sign( a );
4759     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
4760         return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4761     }
4762     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4763     return packFloat128( aSign, aExp, zSig0, zSig1 );
4764 
4765 }
4766 
4767 /*----------------------------------------------------------------------------
4768 | Rounds the extended double-precision floating-point value `a' to an integer,
4769 | and returns the result as an extended quadruple-precision floating-point
4770 | value.  The operation is performed according to the IEC/IEEE Standard for
4771 | Binary Floating-Point Arithmetic.
4772 *----------------------------------------------------------------------------*/
4773 
4774 floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
4775 {
4776     flag aSign;
4777     int32 aExp;
4778     uint64_t lastBitMask, roundBitsMask;
4779     floatx80 z;
4780 
4781     aExp = extractFloatx80Exp( a );
4782     if ( 0x403E <= aExp ) {
4783         if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
4784             return propagateFloatx80NaN( a, a STATUS_VAR );
4785         }
4786         return a;
4787     }
4788     if ( aExp < 0x3FFF ) {
4789         if (    ( aExp == 0 )
4790              && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
4791             return a;
4792         }
4793         STATUS(float_exception_flags) |= float_flag_inexact;
4794         aSign = extractFloatx80Sign( a );
4795         switch ( STATUS(float_rounding_mode) ) {
4796          case float_round_nearest_even:
4797             if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
4798                ) {
4799                 return
4800                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4801             }
4802             break;
4803         case float_round_ties_away:
4804             if (aExp == 0x3FFE) {
4805                 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
4806             }
4807             break;
4808          case float_round_down:
4809             return
4810                   aSign ?
4811                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4812                 : packFloatx80( 0, 0, 0 );
4813          case float_round_up:
4814             return
4815                   aSign ? packFloatx80( 1, 0, 0 )
4816                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4817         }
4818         return packFloatx80( aSign, 0, 0 );
4819     }
4820     lastBitMask = 1;
4821     lastBitMask <<= 0x403E - aExp;
4822     roundBitsMask = lastBitMask - 1;
4823     z = a;
4824     switch (STATUS(float_rounding_mode)) {
4825     case float_round_nearest_even:
4826         z.low += lastBitMask>>1;
4827         if ((z.low & roundBitsMask) == 0) {
4828             z.low &= ~lastBitMask;
4829         }
4830         break;
4831     case float_round_ties_away:
4832         z.low += lastBitMask >> 1;
4833         break;
4834     case float_round_to_zero:
4835         break;
4836     case float_round_up:
4837         if (!extractFloatx80Sign(z)) {
4838             z.low += roundBitsMask;
4839         }
4840         break;
4841     case float_round_down:
4842         if (extractFloatx80Sign(z)) {
4843             z.low += roundBitsMask;
4844         }
4845         break;
4846     default:
4847         abort();
4848     }
4849     z.low &= ~ roundBitsMask;
4850     if ( z.low == 0 ) {
4851         ++z.high;
4852         z.low = LIT64( 0x8000000000000000 );
4853     }
4854     if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
4855     return z;
4856 
4857 }
4858 
4859 /*----------------------------------------------------------------------------
4860 | Returns the result of adding the absolute values of the extended double-
4861 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
4862 | negated before being returned.  `zSign' is ignored if the result is a NaN.
4863 | The addition is performed according to the IEC/IEEE Standard for Binary
4864 | Floating-Point Arithmetic.
4865 *----------------------------------------------------------------------------*/
4866 
4867 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
4868 {
4869     int32 aExp, bExp, zExp;
4870     uint64_t aSig, bSig, zSig0, zSig1;
4871     int32 expDiff;
4872 
4873     aSig = extractFloatx80Frac( a );
4874     aExp = extractFloatx80Exp( a );
4875     bSig = extractFloatx80Frac( b );
4876     bExp = extractFloatx80Exp( b );
4877     expDiff = aExp - bExp;
4878     if ( 0 < expDiff ) {
4879         if ( aExp == 0x7FFF ) {
4880             if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4881             return a;
4882         }
4883         if ( bExp == 0 ) --expDiff;
4884         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4885         zExp = aExp;
4886     }
4887     else if ( expDiff < 0 ) {
4888         if ( bExp == 0x7FFF ) {
4889             if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4890             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4891         }
4892         if ( aExp == 0 ) ++expDiff;
4893         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4894         zExp = bExp;
4895     }
4896     else {
4897         if ( aExp == 0x7FFF ) {
4898             if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4899                 return propagateFloatx80NaN( a, b STATUS_VAR );
4900             }
4901             return a;
4902         }
4903         zSig1 = 0;
4904         zSig0 = aSig + bSig;
4905         if ( aExp == 0 ) {
4906             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4907             goto roundAndPack;
4908         }
4909         zExp = aExp;
4910         goto shiftRight1;
4911     }
4912     zSig0 = aSig + bSig;
4913     if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
4914  shiftRight1:
4915     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4916     zSig0 |= LIT64( 0x8000000000000000 );
4917     ++zExp;
4918  roundAndPack:
4919     return
4920         roundAndPackFloatx80(
4921             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4922 
4923 }
4924 
4925 /*----------------------------------------------------------------------------
4926 | Returns the result of subtracting the absolute values of the extended
4927 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
4928 | difference is negated before being returned.  `zSign' is ignored if the
4929 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
4930 | Standard for Binary Floating-Point Arithmetic.
4931 *----------------------------------------------------------------------------*/
4932 
4933 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
4934 {
4935     int32 aExp, bExp, zExp;
4936     uint64_t aSig, bSig, zSig0, zSig1;
4937     int32 expDiff;
4938     floatx80 z;
4939 
4940     aSig = extractFloatx80Frac( a );
4941     aExp = extractFloatx80Exp( a );
4942     bSig = extractFloatx80Frac( b );
4943     bExp = extractFloatx80Exp( b );
4944     expDiff = aExp - bExp;
4945     if ( 0 < expDiff ) goto aExpBigger;
4946     if ( expDiff < 0 ) goto bExpBigger;
4947     if ( aExp == 0x7FFF ) {
4948         if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4949             return propagateFloatx80NaN( a, b STATUS_VAR );
4950         }
4951         float_raise( float_flag_invalid STATUS_VAR);
4952         z.low = floatx80_default_nan_low;
4953         z.high = floatx80_default_nan_high;
4954         return z;
4955     }
4956     if ( aExp == 0 ) {
4957         aExp = 1;
4958         bExp = 1;
4959     }
4960     zSig1 = 0;
4961     if ( bSig < aSig ) goto aBigger;
4962     if ( aSig < bSig ) goto bBigger;
4963     return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
4964  bExpBigger:
4965     if ( bExp == 0x7FFF ) {
4966         if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4967         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
4968     }
4969     if ( aExp == 0 ) ++expDiff;
4970     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4971  bBigger:
4972     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4973     zExp = bExp;
4974     zSign ^= 1;
4975     goto normalizeRoundAndPack;
4976  aExpBigger:
4977     if ( aExp == 0x7FFF ) {
4978         if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4979         return a;
4980     }
4981     if ( bExp == 0 ) --expDiff;
4982     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4983  aBigger:
4984     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4985     zExp = aExp;
4986  normalizeRoundAndPack:
4987     return
4988         normalizeRoundAndPackFloatx80(
4989             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4990 
4991 }
4992 
4993 /*----------------------------------------------------------------------------
4994 | Returns the result of adding the extended double-precision floating-point
4995 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
4996 | Standard for Binary Floating-Point Arithmetic.
4997 *----------------------------------------------------------------------------*/
4998 
4999 floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
5000 {
5001     flag aSign, bSign;
5002 
5003     aSign = extractFloatx80Sign( a );
5004     bSign = extractFloatx80Sign( b );
5005     if ( aSign == bSign ) {
5006         return addFloatx80Sigs( a, b, aSign STATUS_VAR );
5007     }
5008     else {
5009         return subFloatx80Sigs( a, b, aSign STATUS_VAR );
5010     }
5011 
5012 }
5013 
5014 /*----------------------------------------------------------------------------
5015 | Returns the result of subtracting the extended double-precision floating-
5016 | point values `a' and `b'.  The operation is performed according to the
5017 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5018 *----------------------------------------------------------------------------*/
5019 
5020 floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
5021 {
5022     flag aSign, bSign;
5023 
5024     aSign = extractFloatx80Sign( a );
5025     bSign = extractFloatx80Sign( b );
5026     if ( aSign == bSign ) {
5027         return subFloatx80Sigs( a, b, aSign STATUS_VAR );
5028     }
5029     else {
5030         return addFloatx80Sigs( a, b, aSign STATUS_VAR );
5031     }
5032 
5033 }
5034 
5035 /*----------------------------------------------------------------------------
5036 | Returns the result of multiplying the extended double-precision floating-
5037 | point values `a' and `b'.  The operation is performed according to the
5038 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5039 *----------------------------------------------------------------------------*/
5040 
5041 floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
5042 {
5043     flag aSign, bSign, zSign;
5044     int32 aExp, bExp, zExp;
5045     uint64_t aSig, bSig, zSig0, zSig1;
5046     floatx80 z;
5047 
5048     aSig = extractFloatx80Frac( a );
5049     aExp = extractFloatx80Exp( a );
5050     aSign = extractFloatx80Sign( a );
5051     bSig = extractFloatx80Frac( b );
5052     bExp = extractFloatx80Exp( b );
5053     bSign = extractFloatx80Sign( b );
5054     zSign = aSign ^ bSign;
5055     if ( aExp == 0x7FFF ) {
5056         if (    (uint64_t) ( aSig<<1 )
5057              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5058             return propagateFloatx80NaN( a, b STATUS_VAR );
5059         }
5060         if ( ( bExp | bSig ) == 0 ) goto invalid;
5061         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5062     }
5063     if ( bExp == 0x7FFF ) {
5064         if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
5065         if ( ( aExp | aSig ) == 0 ) {
5066  invalid:
5067             float_raise( float_flag_invalid STATUS_VAR);
5068             z.low = floatx80_default_nan_low;
5069             z.high = floatx80_default_nan_high;
5070             return z;
5071         }
5072         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5073     }
5074     if ( aExp == 0 ) {
5075         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5076         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5077     }
5078     if ( bExp == 0 ) {
5079         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5080         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5081     }
5082     zExp = aExp + bExp - 0x3FFE;
5083     mul64To128( aSig, bSig, &zSig0, &zSig1 );
5084     if ( 0 < (int64_t) zSig0 ) {
5085         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5086         --zExp;
5087     }
5088     return
5089         roundAndPackFloatx80(
5090             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
5091 
5092 }
5093 
5094 /*----------------------------------------------------------------------------
5095 | Returns the result of dividing the extended double-precision floating-point
5096 | value `a' by the corresponding value `b'.  The operation is performed
5097 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5098 *----------------------------------------------------------------------------*/
5099 
5100 floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
5101 {
5102     flag aSign, bSign, zSign;
5103     int32 aExp, bExp, zExp;
5104     uint64_t aSig, bSig, zSig0, zSig1;
5105     uint64_t rem0, rem1, rem2, term0, term1, term2;
5106     floatx80 z;
5107 
5108     aSig = extractFloatx80Frac( a );
5109     aExp = extractFloatx80Exp( a );
5110     aSign = extractFloatx80Sign( a );
5111     bSig = extractFloatx80Frac( b );
5112     bExp = extractFloatx80Exp( b );
5113     bSign = extractFloatx80Sign( b );
5114     zSign = aSign ^ bSign;
5115     if ( aExp == 0x7FFF ) {
5116         if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
5117         if ( bExp == 0x7FFF ) {
5118             if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
5119             goto invalid;
5120         }
5121         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5122     }
5123     if ( bExp == 0x7FFF ) {
5124         if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
5125         return packFloatx80( zSign, 0, 0 );
5126     }
5127     if ( bExp == 0 ) {
5128         if ( bSig == 0 ) {
5129             if ( ( aExp | aSig ) == 0 ) {
5130  invalid:
5131                 float_raise( float_flag_invalid STATUS_VAR);
5132                 z.low = floatx80_default_nan_low;
5133                 z.high = floatx80_default_nan_high;
5134                 return z;
5135             }
5136             float_raise( float_flag_divbyzero STATUS_VAR);
5137             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5138         }
5139         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5140     }
5141     if ( aExp == 0 ) {
5142         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5143         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5144     }
5145     zExp = aExp - bExp + 0x3FFE;
5146     rem1 = 0;
5147     if ( bSig <= aSig ) {
5148         shift128Right( aSig, 0, 1, &aSig, &rem1 );
5149         ++zExp;
5150     }
5151     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5152     mul64To128( bSig, zSig0, &term0, &term1 );
5153     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
5154     while ( (int64_t) rem0 < 0 ) {
5155         --zSig0;
5156         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5157     }
5158     zSig1 = estimateDiv128To64( rem1, 0, bSig );
5159     if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
5160         mul64To128( bSig, zSig1, &term1, &term2 );
5161         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5162         while ( (int64_t) rem1 < 0 ) {
5163             --zSig1;
5164             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5165         }
5166         zSig1 |= ( ( rem1 | rem2 ) != 0 );
5167     }
5168     return
5169         roundAndPackFloatx80(
5170             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
5171 
5172 }
5173 
5174 /*----------------------------------------------------------------------------
5175 | Returns the remainder of the extended double-precision floating-point value
5176 | `a' with respect to the corresponding value `b'.  The operation is performed
5177 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5178 *----------------------------------------------------------------------------*/
5179 
5180 floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
5181 {
5182     flag aSign, zSign;
5183     int32 aExp, bExp, expDiff;
5184     uint64_t aSig0, aSig1, bSig;
5185     uint64_t q, term0, term1, alternateASig0, alternateASig1;
5186     floatx80 z;
5187 
5188     aSig0 = extractFloatx80Frac( a );
5189     aExp = extractFloatx80Exp( a );
5190     aSign = extractFloatx80Sign( a );
5191     bSig = extractFloatx80Frac( b );
5192     bExp = extractFloatx80Exp( b );
5193     if ( aExp == 0x7FFF ) {
5194         if (    (uint64_t) ( aSig0<<1 )
5195              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5196             return propagateFloatx80NaN( a, b STATUS_VAR );
5197         }
5198         goto invalid;
5199     }
5200     if ( bExp == 0x7FFF ) {
5201         if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
5202         return a;
5203     }
5204     if ( bExp == 0 ) {
5205         if ( bSig == 0 ) {
5206  invalid:
5207             float_raise( float_flag_invalid STATUS_VAR);
5208             z.low = floatx80_default_nan_low;
5209             z.high = floatx80_default_nan_high;
5210             return z;
5211         }
5212         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5213     }
5214     if ( aExp == 0 ) {
5215         if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
5216         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5217     }
5218     bSig |= LIT64( 0x8000000000000000 );
5219     zSign = aSign;
5220     expDiff = aExp - bExp;
5221     aSig1 = 0;
5222     if ( expDiff < 0 ) {
5223         if ( expDiff < -1 ) return a;
5224         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5225         expDiff = 0;
5226     }
5227     q = ( bSig <= aSig0 );
5228     if ( q ) aSig0 -= bSig;
5229     expDiff -= 64;
5230     while ( 0 < expDiff ) {
5231         q = estimateDiv128To64( aSig0, aSig1, bSig );
5232         q = ( 2 < q ) ? q - 2 : 0;
5233         mul64To128( bSig, q, &term0, &term1 );
5234         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5235         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5236         expDiff -= 62;
5237     }
5238     expDiff += 64;
5239     if ( 0 < expDiff ) {
5240         q = estimateDiv128To64( aSig0, aSig1, bSig );
5241         q = ( 2 < q ) ? q - 2 : 0;
5242         q >>= 64 - expDiff;
5243         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5244         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5245         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5246         while ( le128( term0, term1, aSig0, aSig1 ) ) {
5247             ++q;
5248             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5249         }
5250     }
5251     else {
5252         term1 = 0;
5253         term0 = bSig;
5254     }
5255     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5256     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5257          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5258               && ( q & 1 ) )
5259        ) {
5260         aSig0 = alternateASig0;
5261         aSig1 = alternateASig1;
5262         zSign = ! zSign;
5263     }
5264     return
5265         normalizeRoundAndPackFloatx80(
5266             80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
5267 
5268 }
5269 
5270 /*----------------------------------------------------------------------------
5271 | Returns the square root of the extended double-precision floating-point
5272 | value `a'.  The operation is performed according to the IEC/IEEE Standard
5273 | for Binary Floating-Point Arithmetic.
5274 *----------------------------------------------------------------------------*/
5275 
5276 floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
5277 {
5278     flag aSign;
5279     int32 aExp, zExp;
5280     uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5281     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5282     floatx80 z;
5283 
5284     aSig0 = extractFloatx80Frac( a );
5285     aExp = extractFloatx80Exp( a );
5286     aSign = extractFloatx80Sign( a );
5287     if ( aExp == 0x7FFF ) {
5288         if ( (uint64_t) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
5289         if ( ! aSign ) return a;
5290         goto invalid;
5291     }
5292     if ( aSign ) {
5293         if ( ( aExp | aSig0 ) == 0 ) return a;
5294  invalid:
5295         float_raise( float_flag_invalid STATUS_VAR);
5296         z.low = floatx80_default_nan_low;
5297         z.high = floatx80_default_nan_high;
5298         return z;
5299     }
5300     if ( aExp == 0 ) {
5301         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5302         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5303     }
5304     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5305     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5306     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5307     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5308     doubleZSig0 = zSig0<<1;
5309     mul64To128( zSig0, zSig0, &term0, &term1 );
5310     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5311     while ( (int64_t) rem0 < 0 ) {
5312         --zSig0;
5313         doubleZSig0 -= 2;
5314         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5315     }
5316     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5317     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5318         if ( zSig1 == 0 ) zSig1 = 1;
5319         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5320         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5321         mul64To128( zSig1, zSig1, &term2, &term3 );
5322         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5323         while ( (int64_t) rem1 < 0 ) {
5324             --zSig1;
5325             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5326             term3 |= 1;
5327             term2 |= doubleZSig0;
5328             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5329         }
5330         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5331     }
5332     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5333     zSig0 |= doubleZSig0;
5334     return
5335         roundAndPackFloatx80(
5336             STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
5337 
5338 }
5339 
5340 /*----------------------------------------------------------------------------
5341 | Returns 1 if the extended double-precision floating-point value `a' is equal
5342 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
5343 | raised if either operand is a NaN.  Otherwise, the comparison is performed
5344 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5345 *----------------------------------------------------------------------------*/
5346 
5347 int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
5348 {
5349 
5350     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5351               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5352          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5353               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5354        ) {
5355         float_raise( float_flag_invalid STATUS_VAR);
5356         return 0;
5357     }
5358     return
5359            ( a.low == b.low )
5360         && (    ( a.high == b.high )
5361              || (    ( a.low == 0 )
5362                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5363            );
5364 
5365 }
5366 
5367 /*----------------------------------------------------------------------------
5368 | Returns 1 if the extended double-precision floating-point value `a' is
5369 | less than or equal to the corresponding value `b', and 0 otherwise.  The
5370 | invalid exception is raised if either operand is a NaN.  The comparison is
5371 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5372 | Arithmetic.
5373 *----------------------------------------------------------------------------*/
5374 
5375 int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
5376 {
5377     flag aSign, bSign;
5378 
5379     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5380               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5381          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5382               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5383        ) {
5384         float_raise( float_flag_invalid STATUS_VAR);
5385         return 0;
5386     }
5387     aSign = extractFloatx80Sign( a );
5388     bSign = extractFloatx80Sign( b );
5389     if ( aSign != bSign ) {
5390         return
5391                aSign
5392             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5393                  == 0 );
5394     }
5395     return
5396           aSign ? le128( b.high, b.low, a.high, a.low )
5397         : le128( a.high, a.low, b.high, b.low );
5398 
5399 }
5400 
5401 /*----------------------------------------------------------------------------
5402 | Returns 1 if the extended double-precision floating-point value `a' is
5403 | less than the corresponding value `b', and 0 otherwise.  The invalid
5404 | exception is raised if either operand is a NaN.  The comparison is performed
5405 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5406 *----------------------------------------------------------------------------*/
5407 
5408 int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
5409 {
5410     flag aSign, bSign;
5411 
5412     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5413               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5414          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5415               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5416        ) {
5417         float_raise( float_flag_invalid STATUS_VAR);
5418         return 0;
5419     }
5420     aSign = extractFloatx80Sign( a );
5421     bSign = extractFloatx80Sign( b );
5422     if ( aSign != bSign ) {
5423         return
5424                aSign
5425             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5426                  != 0 );
5427     }
5428     return
5429           aSign ? lt128( b.high, b.low, a.high, a.low )
5430         : lt128( a.high, a.low, b.high, b.low );
5431 
5432 }
5433 
5434 /*----------------------------------------------------------------------------
5435 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5436 | cannot be compared, and 0 otherwise.  The invalid exception is raised if
5437 | either operand is a NaN.   The comparison is performed according to the
5438 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5439 *----------------------------------------------------------------------------*/
5440 int floatx80_unordered( floatx80 a, floatx80 b STATUS_PARAM )
5441 {
5442     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5443               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5444          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5445               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5446        ) {
5447         float_raise( float_flag_invalid STATUS_VAR);
5448         return 1;
5449     }
5450     return 0;
5451 }
5452 
5453 /*----------------------------------------------------------------------------
5454 | Returns 1 if the extended double-precision floating-point value `a' is
5455 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5456 | cause an exception.  The comparison is performed according to the IEC/IEEE
5457 | Standard for Binary Floating-Point Arithmetic.
5458 *----------------------------------------------------------------------------*/
5459 
5460 int floatx80_eq_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5461 {
5462 
5463     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5464               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5465          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5466               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5467        ) {
5468         if (    floatx80_is_signaling_nan( a )
5469              || floatx80_is_signaling_nan( b ) ) {
5470             float_raise( float_flag_invalid STATUS_VAR);
5471         }
5472         return 0;
5473     }
5474     return
5475            ( a.low == b.low )
5476         && (    ( a.high == b.high )
5477              || (    ( a.low == 0 )
5478                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5479            );
5480 
5481 }
5482 
5483 /*----------------------------------------------------------------------------
5484 | Returns 1 if the extended double-precision floating-point value `a' is less
5485 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
5486 | do not cause an exception.  Otherwise, the comparison is performed according
5487 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5488 *----------------------------------------------------------------------------*/
5489 
5490 int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5491 {
5492     flag aSign, bSign;
5493 
5494     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5495               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5496          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5497               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5498        ) {
5499         if (    floatx80_is_signaling_nan( a )
5500              || floatx80_is_signaling_nan( b ) ) {
5501             float_raise( float_flag_invalid STATUS_VAR);
5502         }
5503         return 0;
5504     }
5505     aSign = extractFloatx80Sign( a );
5506     bSign = extractFloatx80Sign( b );
5507     if ( aSign != bSign ) {
5508         return
5509                aSign
5510             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5511                  == 0 );
5512     }
5513     return
5514           aSign ? le128( b.high, b.low, a.high, a.low )
5515         : le128( a.high, a.low, b.high, b.low );
5516 
5517 }
5518 
5519 /*----------------------------------------------------------------------------
5520 | Returns 1 if the extended double-precision floating-point value `a' is less
5521 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
5522 | an exception.  Otherwise, the comparison is performed according to the
5523 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5524 *----------------------------------------------------------------------------*/
5525 
5526 int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5527 {
5528     flag aSign, bSign;
5529 
5530     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5531               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5532          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5533               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5534        ) {
5535         if (    floatx80_is_signaling_nan( a )
5536              || floatx80_is_signaling_nan( b ) ) {
5537             float_raise( float_flag_invalid STATUS_VAR);
5538         }
5539         return 0;
5540     }
5541     aSign = extractFloatx80Sign( a );
5542     bSign = extractFloatx80Sign( b );
5543     if ( aSign != bSign ) {
5544         return
5545                aSign
5546             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5547                  != 0 );
5548     }
5549     return
5550           aSign ? lt128( b.high, b.low, a.high, a.low )
5551         : lt128( a.high, a.low, b.high, b.low );
5552 
5553 }
5554 
5555 /*----------------------------------------------------------------------------
5556 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5557 | cannot be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.
5558 | The comparison is performed according to the IEC/IEEE Standard for Binary
5559 | Floating-Point Arithmetic.
5560 *----------------------------------------------------------------------------*/
5561 int floatx80_unordered_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5562 {
5563     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5564               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5565          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5566               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5567        ) {
5568         if (    floatx80_is_signaling_nan( a )
5569              || floatx80_is_signaling_nan( b ) ) {
5570             float_raise( float_flag_invalid STATUS_VAR);
5571         }
5572         return 1;
5573     }
5574     return 0;
5575 }
5576 
5577 /*----------------------------------------------------------------------------
5578 | Returns the result of converting the quadruple-precision floating-point
5579 | value `a' to the 32-bit two's complement integer format.  The conversion
5580 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5581 | Arithmetic---which means in particular that the conversion is rounded
5582 | according to the current rounding mode.  If `a' is a NaN, the largest
5583 | positive integer is returned.  Otherwise, if the conversion overflows, the
5584 | largest integer with the same sign as `a' is returned.
5585 *----------------------------------------------------------------------------*/
5586 
5587 int32 float128_to_int32( float128 a STATUS_PARAM )
5588 {
5589     flag aSign;
5590     int32 aExp, shiftCount;
5591     uint64_t aSig0, aSig1;
5592 
5593     aSig1 = extractFloat128Frac1( a );
5594     aSig0 = extractFloat128Frac0( a );
5595     aExp = extractFloat128Exp( a );
5596     aSign = extractFloat128Sign( a );
5597     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5598     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5599     aSig0 |= ( aSig1 != 0 );
5600     shiftCount = 0x4028 - aExp;
5601     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5602     return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
5603 
5604 }
5605 
5606 /*----------------------------------------------------------------------------
5607 | Returns the result of converting the quadruple-precision floating-point
5608 | value `a' to the 32-bit two's complement integer format.  The conversion
5609 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5610 | Arithmetic, except that the conversion is always rounded toward zero.  If
5611 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
5612 | conversion overflows, the largest integer with the same sign as `a' is
5613 | returned.
5614 *----------------------------------------------------------------------------*/
5615 
5616 int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
5617 {
5618     flag aSign;
5619     int32 aExp, shiftCount;
5620     uint64_t aSig0, aSig1, savedASig;
5621     int32_t z;
5622 
5623     aSig1 = extractFloat128Frac1( a );
5624     aSig0 = extractFloat128Frac0( a );
5625     aExp = extractFloat128Exp( a );
5626     aSign = extractFloat128Sign( a );
5627     aSig0 |= ( aSig1 != 0 );
5628     if ( 0x401E < aExp ) {
5629         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5630         goto invalid;
5631     }
5632     else if ( aExp < 0x3FFF ) {
5633         if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
5634         return 0;
5635     }
5636     aSig0 |= LIT64( 0x0001000000000000 );
5637     shiftCount = 0x402F - aExp;
5638     savedASig = aSig0;
5639     aSig0 >>= shiftCount;
5640     z = aSig0;
5641     if ( aSign ) z = - z;
5642     if ( ( z < 0 ) ^ aSign ) {
5643  invalid:
5644         float_raise( float_flag_invalid STATUS_VAR);
5645         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5646     }
5647     if ( ( aSig0<<shiftCount ) != savedASig ) {
5648         STATUS(float_exception_flags) |= float_flag_inexact;
5649     }
5650     return z;
5651 
5652 }
5653 
5654 /*----------------------------------------------------------------------------
5655 | Returns the result of converting the quadruple-precision floating-point
5656 | value `a' to the 64-bit two's complement integer format.  The conversion
5657 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5658 | Arithmetic---which means in particular that the conversion is rounded
5659 | according to the current rounding mode.  If `a' is a NaN, the largest
5660 | positive integer is returned.  Otherwise, if the conversion overflows, the
5661 | largest integer with the same sign as `a' is returned.
5662 *----------------------------------------------------------------------------*/
5663 
5664 int64 float128_to_int64( float128 a STATUS_PARAM )
5665 {
5666     flag aSign;
5667     int32 aExp, shiftCount;
5668     uint64_t aSig0, aSig1;
5669 
5670     aSig1 = extractFloat128Frac1( a );
5671     aSig0 = extractFloat128Frac0( a );
5672     aExp = extractFloat128Exp( a );
5673     aSign = extractFloat128Sign( a );
5674     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5675     shiftCount = 0x402F - aExp;
5676     if ( shiftCount <= 0 ) {
5677         if ( 0x403E < aExp ) {
5678             float_raise( float_flag_invalid STATUS_VAR);
5679             if (    ! aSign
5680                  || (    ( aExp == 0x7FFF )
5681                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5682                     )
5683                ) {
5684                 return LIT64( 0x7FFFFFFFFFFFFFFF );
5685             }
5686             return (int64_t) LIT64( 0x8000000000000000 );
5687         }
5688         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5689     }
5690     else {
5691         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5692     }
5693     return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
5694 
5695 }
5696 
5697 /*----------------------------------------------------------------------------
5698 | Returns the result of converting the quadruple-precision floating-point
5699 | value `a' to the 64-bit two's complement integer format.  The conversion
5700 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5701 | Arithmetic, except that the conversion is always rounded toward zero.
5702 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
5703 | the conversion overflows, the largest integer with the same sign as `a' is
5704 | returned.
5705 *----------------------------------------------------------------------------*/
5706 
5707 int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
5708 {
5709     flag aSign;
5710     int32 aExp, shiftCount;
5711     uint64_t aSig0, aSig1;
5712     int64 z;
5713 
5714     aSig1 = extractFloat128Frac1( a );
5715     aSig0 = extractFloat128Frac0( a );
5716     aExp = extractFloat128Exp( a );
5717     aSign = extractFloat128Sign( a );
5718     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5719     shiftCount = aExp - 0x402F;
5720     if ( 0 < shiftCount ) {
5721         if ( 0x403E <= aExp ) {
5722             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5723             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
5724                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
5725                 if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
5726             }
5727             else {
5728                 float_raise( float_flag_invalid STATUS_VAR);
5729                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
5730                     return LIT64( 0x7FFFFFFFFFFFFFFF );
5731                 }
5732             }
5733             return (int64_t) LIT64( 0x8000000000000000 );
5734         }
5735         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
5736         if ( (uint64_t) ( aSig1<<shiftCount ) ) {
5737             STATUS(float_exception_flags) |= float_flag_inexact;
5738         }
5739     }
5740     else {
5741         if ( aExp < 0x3FFF ) {
5742             if ( aExp | aSig0 | aSig1 ) {
5743                 STATUS(float_exception_flags) |= float_flag_inexact;
5744             }
5745             return 0;
5746         }
5747         z = aSig0>>( - shiftCount );
5748         if (    aSig1
5749              || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
5750             STATUS(float_exception_flags) |= float_flag_inexact;
5751         }
5752     }
5753     if ( aSign ) z = - z;
5754     return z;
5755 
5756 }
5757 
5758 /*----------------------------------------------------------------------------
5759 | Returns the result of converting the quadruple-precision floating-point
5760 | value `a' to the single-precision floating-point format.  The conversion
5761 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5762 | Arithmetic.
5763 *----------------------------------------------------------------------------*/
5764 
5765 float32 float128_to_float32( float128 a STATUS_PARAM )
5766 {
5767     flag aSign;
5768     int32 aExp;
5769     uint64_t aSig0, aSig1;
5770     uint32_t zSig;
5771 
5772     aSig1 = extractFloat128Frac1( a );
5773     aSig0 = extractFloat128Frac0( a );
5774     aExp = extractFloat128Exp( a );
5775     aSign = extractFloat128Sign( a );
5776     if ( aExp == 0x7FFF ) {
5777         if ( aSig0 | aSig1 ) {
5778             return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5779         }
5780         return packFloat32( aSign, 0xFF, 0 );
5781     }
5782     aSig0 |= ( aSig1 != 0 );
5783     shift64RightJamming( aSig0, 18, &aSig0 );
5784     zSig = aSig0;
5785     if ( aExp || zSig ) {
5786         zSig |= 0x40000000;
5787         aExp -= 0x3F81;
5788     }
5789     return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
5790 
5791 }
5792 
5793 /*----------------------------------------------------------------------------
5794 | Returns the result of converting the quadruple-precision floating-point
5795 | value `a' to the double-precision floating-point format.  The conversion
5796 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5797 | Arithmetic.
5798 *----------------------------------------------------------------------------*/
5799 
5800 float64 float128_to_float64( float128 a STATUS_PARAM )
5801 {
5802     flag aSign;
5803     int32 aExp;
5804     uint64_t aSig0, aSig1;
5805 
5806     aSig1 = extractFloat128Frac1( a );
5807     aSig0 = extractFloat128Frac0( a );
5808     aExp = extractFloat128Exp( a );
5809     aSign = extractFloat128Sign( a );
5810     if ( aExp == 0x7FFF ) {
5811         if ( aSig0 | aSig1 ) {
5812             return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5813         }
5814         return packFloat64( aSign, 0x7FF, 0 );
5815     }
5816     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5817     aSig0 |= ( aSig1 != 0 );
5818     if ( aExp || aSig0 ) {
5819         aSig0 |= LIT64( 0x4000000000000000 );
5820         aExp -= 0x3C01;
5821     }
5822     return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
5823 
5824 }
5825 
5826 /*----------------------------------------------------------------------------
5827 | Returns the result of converting the quadruple-precision floating-point
5828 | value `a' to the extended double-precision floating-point format.  The
5829 | conversion is performed according to the IEC/IEEE Standard for Binary
5830 | Floating-Point Arithmetic.
5831 *----------------------------------------------------------------------------*/
5832 
5833 floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
5834 {
5835     flag aSign;
5836     int32 aExp;
5837     uint64_t aSig0, aSig1;
5838 
5839     aSig1 = extractFloat128Frac1( a );
5840     aSig0 = extractFloat128Frac0( a );
5841     aExp = extractFloat128Exp( a );
5842     aSign = extractFloat128Sign( a );
5843     if ( aExp == 0x7FFF ) {
5844         if ( aSig0 | aSig1 ) {
5845             return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5846         }
5847         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5848     }
5849     if ( aExp == 0 ) {
5850         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
5851         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5852     }
5853     else {
5854         aSig0 |= LIT64( 0x0001000000000000 );
5855     }
5856     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
5857     return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
5858 
5859 }
5860 
5861 /*----------------------------------------------------------------------------
5862 | Rounds the quadruple-precision floating-point value `a' to an integer, and
5863 | returns the result as a quadruple-precision floating-point value.  The
5864 | operation is performed according to the IEC/IEEE Standard for Binary
5865 | Floating-Point Arithmetic.
5866 *----------------------------------------------------------------------------*/
5867 
5868 float128 float128_round_to_int( float128 a STATUS_PARAM )
5869 {
5870     flag aSign;
5871     int32 aExp;
5872     uint64_t lastBitMask, roundBitsMask;
5873     float128 z;
5874 
5875     aExp = extractFloat128Exp( a );
5876     if ( 0x402F <= aExp ) {
5877         if ( 0x406F <= aExp ) {
5878             if (    ( aExp == 0x7FFF )
5879                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
5880                ) {
5881                 return propagateFloat128NaN( a, a STATUS_VAR );
5882             }
5883             return a;
5884         }
5885         lastBitMask = 1;
5886         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
5887         roundBitsMask = lastBitMask - 1;
5888         z = a;
5889         switch (STATUS(float_rounding_mode)) {
5890         case float_round_nearest_even:
5891             if ( lastBitMask ) {
5892                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
5893                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
5894             }
5895             else {
5896                 if ( (int64_t) z.low < 0 ) {
5897                     ++z.high;
5898                     if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
5899                 }
5900             }
5901             break;
5902         case float_round_ties_away:
5903             if (lastBitMask) {
5904                 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
5905             } else {
5906                 if ((int64_t) z.low < 0) {
5907                     ++z.high;
5908                 }
5909             }
5910             break;
5911         case float_round_to_zero:
5912             break;
5913         case float_round_up:
5914             if (!extractFloat128Sign(z)) {
5915                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
5916             }
5917             break;
5918         case float_round_down:
5919             if (extractFloat128Sign(z)) {
5920                 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
5921             }
5922             break;
5923         default:
5924             abort();
5925         }
5926         z.low &= ~ roundBitsMask;
5927     }
5928     else {
5929         if ( aExp < 0x3FFF ) {
5930             if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
5931             STATUS(float_exception_flags) |= float_flag_inexact;
5932             aSign = extractFloat128Sign( a );
5933             switch ( STATUS(float_rounding_mode) ) {
5934              case float_round_nearest_even:
5935                 if (    ( aExp == 0x3FFE )
5936                      && (   extractFloat128Frac0( a )
5937                           | extractFloat128Frac1( a ) )
5938                    ) {
5939                     return packFloat128( aSign, 0x3FFF, 0, 0 );
5940                 }
5941                 break;
5942             case float_round_ties_away:
5943                 if (aExp == 0x3FFE) {
5944                     return packFloat128(aSign, 0x3FFF, 0, 0);
5945                 }
5946                 break;
5947              case float_round_down:
5948                 return
5949                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
5950                     : packFloat128( 0, 0, 0, 0 );
5951              case float_round_up:
5952                 return
5953                       aSign ? packFloat128( 1, 0, 0, 0 )
5954                     : packFloat128( 0, 0x3FFF, 0, 0 );
5955             }
5956             return packFloat128( aSign, 0, 0, 0 );
5957         }
5958         lastBitMask = 1;
5959         lastBitMask <<= 0x402F - aExp;
5960         roundBitsMask = lastBitMask - 1;
5961         z.low = 0;
5962         z.high = a.high;
5963         switch (STATUS(float_rounding_mode)) {
5964         case float_round_nearest_even:
5965             z.high += lastBitMask>>1;
5966             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
5967                 z.high &= ~ lastBitMask;
5968             }
5969             break;
5970         case float_round_ties_away:
5971             z.high += lastBitMask>>1;
5972             break;
5973         case float_round_to_zero:
5974             break;
5975         case float_round_up:
5976             if (!extractFloat128Sign(z)) {
5977                 z.high |= ( a.low != 0 );
5978                 z.high += roundBitsMask;
5979             }
5980             break;
5981         case float_round_down:
5982             if (extractFloat128Sign(z)) {
5983                 z.high |= (a.low != 0);
5984                 z.high += roundBitsMask;
5985             }
5986             break;
5987         default:
5988             abort();
5989         }
5990         z.high &= ~ roundBitsMask;
5991     }
5992     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
5993         STATUS(float_exception_flags) |= float_flag_inexact;
5994     }
5995     return z;
5996 
5997 }
5998 
5999 /*----------------------------------------------------------------------------
6000 | Returns the result of adding the absolute values of the quadruple-precision
6001 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
6002 | before being returned.  `zSign' is ignored if the result is a NaN.
6003 | The addition is performed according to the IEC/IEEE Standard for Binary
6004 | Floating-Point Arithmetic.
6005 *----------------------------------------------------------------------------*/
6006 
6007 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
6008 {
6009     int32 aExp, bExp, zExp;
6010     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6011     int32 expDiff;
6012 
6013     aSig1 = extractFloat128Frac1( a );
6014     aSig0 = extractFloat128Frac0( a );
6015     aExp = extractFloat128Exp( a );
6016     bSig1 = extractFloat128Frac1( b );
6017     bSig0 = extractFloat128Frac0( b );
6018     bExp = extractFloat128Exp( b );
6019     expDiff = aExp - bExp;
6020     if ( 0 < expDiff ) {
6021         if ( aExp == 0x7FFF ) {
6022             if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6023             return a;
6024         }
6025         if ( bExp == 0 ) {
6026             --expDiff;
6027         }
6028         else {
6029             bSig0 |= LIT64( 0x0001000000000000 );
6030         }
6031         shift128ExtraRightJamming(
6032             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
6033         zExp = aExp;
6034     }
6035     else if ( expDiff < 0 ) {
6036         if ( bExp == 0x7FFF ) {
6037             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6038             return packFloat128( zSign, 0x7FFF, 0, 0 );
6039         }
6040         if ( aExp == 0 ) {
6041             ++expDiff;
6042         }
6043         else {
6044             aSig0 |= LIT64( 0x0001000000000000 );
6045         }
6046         shift128ExtraRightJamming(
6047             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
6048         zExp = bExp;
6049     }
6050     else {
6051         if ( aExp == 0x7FFF ) {
6052             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6053                 return propagateFloat128NaN( a, b STATUS_VAR );
6054             }
6055             return a;
6056         }
6057         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6058         if ( aExp == 0 ) {
6059             if (STATUS(flush_to_zero)) {
6060                 if (zSig0 | zSig1) {
6061                     float_raise(float_flag_output_denormal STATUS_VAR);
6062                 }
6063                 return packFloat128(zSign, 0, 0, 0);
6064             }
6065             return packFloat128( zSign, 0, zSig0, zSig1 );
6066         }
6067         zSig2 = 0;
6068         zSig0 |= LIT64( 0x0002000000000000 );
6069         zExp = aExp;
6070         goto shiftRight1;
6071     }
6072     aSig0 |= LIT64( 0x0001000000000000 );
6073     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6074     --zExp;
6075     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
6076     ++zExp;
6077  shiftRight1:
6078     shift128ExtraRightJamming(
6079         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6080  roundAndPack:
6081     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6082 
6083 }
6084 
6085 /*----------------------------------------------------------------------------
6086 | Returns the result of subtracting the absolute values of the quadruple-
6087 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
6088 | difference is negated before being returned.  `zSign' is ignored if the
6089 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
6090 | Standard for Binary Floating-Point Arithmetic.
6091 *----------------------------------------------------------------------------*/
6092 
6093 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
6094 {
6095     int32 aExp, bExp, zExp;
6096     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
6097     int32 expDiff;
6098     float128 z;
6099 
6100     aSig1 = extractFloat128Frac1( a );
6101     aSig0 = extractFloat128Frac0( a );
6102     aExp = extractFloat128Exp( a );
6103     bSig1 = extractFloat128Frac1( b );
6104     bSig0 = extractFloat128Frac0( b );
6105     bExp = extractFloat128Exp( b );
6106     expDiff = aExp - bExp;
6107     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6108     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6109     if ( 0 < expDiff ) goto aExpBigger;
6110     if ( expDiff < 0 ) goto bExpBigger;
6111     if ( aExp == 0x7FFF ) {
6112         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6113             return propagateFloat128NaN( a, b STATUS_VAR );
6114         }
6115         float_raise( float_flag_invalid STATUS_VAR);
6116         z.low = float128_default_nan_low;
6117         z.high = float128_default_nan_high;
6118         return z;
6119     }
6120     if ( aExp == 0 ) {
6121         aExp = 1;
6122         bExp = 1;
6123     }
6124     if ( bSig0 < aSig0 ) goto aBigger;
6125     if ( aSig0 < bSig0 ) goto bBigger;
6126     if ( bSig1 < aSig1 ) goto aBigger;
6127     if ( aSig1 < bSig1 ) goto bBigger;
6128     return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
6129  bExpBigger:
6130     if ( bExp == 0x7FFF ) {
6131         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6132         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6133     }
6134     if ( aExp == 0 ) {
6135         ++expDiff;
6136     }
6137     else {
6138         aSig0 |= LIT64( 0x4000000000000000 );
6139     }
6140     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6141     bSig0 |= LIT64( 0x4000000000000000 );
6142  bBigger:
6143     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6144     zExp = bExp;
6145     zSign ^= 1;
6146     goto normalizeRoundAndPack;
6147  aExpBigger:
6148     if ( aExp == 0x7FFF ) {
6149         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6150         return a;
6151     }
6152     if ( bExp == 0 ) {
6153         --expDiff;
6154     }
6155     else {
6156         bSig0 |= LIT64( 0x4000000000000000 );
6157     }
6158     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6159     aSig0 |= LIT64( 0x4000000000000000 );
6160  aBigger:
6161     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6162     zExp = aExp;
6163  normalizeRoundAndPack:
6164     --zExp;
6165     return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
6166 
6167 }
6168 
6169 /*----------------------------------------------------------------------------
6170 | Returns the result of adding the quadruple-precision floating-point values
6171 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
6172 | for Binary Floating-Point Arithmetic.
6173 *----------------------------------------------------------------------------*/
6174 
6175 float128 float128_add( float128 a, float128 b STATUS_PARAM )
6176 {
6177     flag aSign, bSign;
6178 
6179     aSign = extractFloat128Sign( a );
6180     bSign = extractFloat128Sign( b );
6181     if ( aSign == bSign ) {
6182         return addFloat128Sigs( a, b, aSign STATUS_VAR );
6183     }
6184     else {
6185         return subFloat128Sigs( a, b, aSign STATUS_VAR );
6186     }
6187 
6188 }
6189 
6190 /*----------------------------------------------------------------------------
6191 | Returns the result of subtracting the quadruple-precision floating-point
6192 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6193 | Standard for Binary Floating-Point Arithmetic.
6194 *----------------------------------------------------------------------------*/
6195 
6196 float128 float128_sub( float128 a, float128 b STATUS_PARAM )
6197 {
6198     flag aSign, bSign;
6199 
6200     aSign = extractFloat128Sign( a );
6201     bSign = extractFloat128Sign( b );
6202     if ( aSign == bSign ) {
6203         return subFloat128Sigs( a, b, aSign STATUS_VAR );
6204     }
6205     else {
6206         return addFloat128Sigs( a, b, aSign STATUS_VAR );
6207     }
6208 
6209 }
6210 
6211 /*----------------------------------------------------------------------------
6212 | Returns the result of multiplying the quadruple-precision floating-point
6213 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6214 | Standard for Binary Floating-Point Arithmetic.
6215 *----------------------------------------------------------------------------*/
6216 
6217 float128 float128_mul( float128 a, float128 b STATUS_PARAM )
6218 {
6219     flag aSign, bSign, zSign;
6220     int32 aExp, bExp, zExp;
6221     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
6222     float128 z;
6223 
6224     aSig1 = extractFloat128Frac1( a );
6225     aSig0 = extractFloat128Frac0( a );
6226     aExp = extractFloat128Exp( a );
6227     aSign = extractFloat128Sign( a );
6228     bSig1 = extractFloat128Frac1( b );
6229     bSig0 = extractFloat128Frac0( b );
6230     bExp = extractFloat128Exp( b );
6231     bSign = extractFloat128Sign( b );
6232     zSign = aSign ^ bSign;
6233     if ( aExp == 0x7FFF ) {
6234         if (    ( aSig0 | aSig1 )
6235              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6236             return propagateFloat128NaN( a, b STATUS_VAR );
6237         }
6238         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6239         return packFloat128( zSign, 0x7FFF, 0, 0 );
6240     }
6241     if ( bExp == 0x7FFF ) {
6242         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6243         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6244  invalid:
6245             float_raise( float_flag_invalid STATUS_VAR);
6246             z.low = float128_default_nan_low;
6247             z.high = float128_default_nan_high;
6248             return z;
6249         }
6250         return packFloat128( zSign, 0x7FFF, 0, 0 );
6251     }
6252     if ( aExp == 0 ) {
6253         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6254         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6255     }
6256     if ( bExp == 0 ) {
6257         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6258         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6259     }
6260     zExp = aExp + bExp - 0x4000;
6261     aSig0 |= LIT64( 0x0001000000000000 );
6262     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6263     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6264     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6265     zSig2 |= ( zSig3 != 0 );
6266     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6267         shift128ExtraRightJamming(
6268             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6269         ++zExp;
6270     }
6271     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6272 
6273 }
6274 
/*----------------------------------------------------------------------------
| Returns the result of dividing the quadruple-precision floating-point value
| `a' by the corresponding value `b'.  The operation is performed according to
| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
|
| Special cases, as handled by the code below:
|   - a NaN operand is resolved by propagateFloat128NaN();
|   - infinity / infinity and zero / zero raise float_flag_invalid and return
|     the default NaN;
|   - a nonzero finite value divided by zero raises float_flag_divbyzero and
|     returns an infinity carrying the result sign;
|   - infinity / finite returns a signed infinity, finite / infinity and
|     zero / nonzero return a signed zero.
| The result sign is always the exclusive-or of the operand signs.
*----------------------------------------------------------------------------*/

float128 float128_div( float128 a, float128 b STATUS_PARAM )
{
    flag aSign, bSign, zSign;
    int32 aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
    float128 z;

    /* Unpack both operands: high/low fraction words, exponent and sign. */
    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    bSign = extractFloat128Sign( b );
    zSign = aSign ^ bSign;
    /* `a' has the maximum exponent: it is a NaN (nonzero fraction) or an
       infinity (zero fraction). */
    if ( aExp == 0x7FFF ) {
        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
        if ( bExp == 0x7FFF ) {
            if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
            /* infinity / infinity */
            goto invalid;
        }
        return packFloat128( zSign, 0x7FFF, 0, 0 );
    }
    /* `a' is finite here; `b' is a NaN or an infinity. */
    if ( bExp == 0x7FFF ) {
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
        return packFloat128( zSign, 0, 0, 0 );
    }
    /* `b' is zero or subnormal. */
    if ( bExp == 0 ) {
        if ( ( bSig0 | bSig1 ) == 0 ) {
            if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
                /* zero / zero; also the target of the infinity / infinity
                   jump above. */
 invalid:
                float_raise( float_flag_invalid STATUS_VAR);
                z.low = float128_default_nan_low;
                z.high = float128_default_nan_high;
                return z;
            }
            /* nonzero finite / zero */
            float_raise( float_flag_divbyzero STATUS_VAR);
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
    }
    /* `a' is zero or subnormal (and `b' is a nonzero finite value). */
    if ( aExp == 0 ) {
        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
    }
    zExp = aExp - bExp + 0x3FFD;
    /* Set the implicit leading bit (bit 48 of the high word) in both
       significands and shift them left by 15 bits. */
    shortShift128Left(
        aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
    shortShift128Left(
        bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
    /* When the divisor significand is <= the dividend significand, halve
       the dividend and compensate in the exponent, so that the dividend is
       strictly smaller than the divisor from here on. */
    if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
        shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
        ++zExp;
    }
    /* First 64-bit quotient digit: start from the estimate, compute the
       remainder a - b * zSig0, then step the digit down (adding the divisor
       back) for as long as the remainder is negative. */
    zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
    mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
    sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
    while ( (int64_t) rem0 < 0 ) {
        --zSig0;
        add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
    }
    /* Second 64-bit quotient digit, estimated from the remainder. */
    zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
    /* The exact remainder is only worked out when the low 14 bits of the
       estimate are <= 4.  NOTE(review): presumably the estimate's error
       bound means it cannot influence rounding otherwise -- confirm
       against estimateDiv128To64(). */
    if ( ( zSig1 & 0x3FFF ) <= 4 ) {
        mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
        sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
        while ( (int64_t) rem1 < 0 ) {
            --zSig1;
            add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
        }
        /* Fold any nonzero final remainder into the lowest quotient bit. */
        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
    }
    /* Shift the 128-bit quotient right by 15 bits; the bits shifted out are
       collected in zSig2 for the rounding step. */
    shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );

}
6358 
6359 /*----------------------------------------------------------------------------
6360 | Returns the remainder of the quadruple-precision floating-point value `a'
6361 | with respect to the corresponding value `b'.  The operation is performed
6362 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6363 *----------------------------------------------------------------------------*/
6364 
6365 float128 float128_rem( float128 a, float128 b STATUS_PARAM )
6366 {
6367     flag aSign, zSign;
6368     int32 aExp, bExp, expDiff;
6369     uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6370     uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6371     int64_t sigMean0;
6372     float128 z;
6373 
6374     aSig1 = extractFloat128Frac1( a );
6375     aSig0 = extractFloat128Frac0( a );
6376     aExp = extractFloat128Exp( a );
6377     aSign = extractFloat128Sign( a );
6378     bSig1 = extractFloat128Frac1( b );
6379     bSig0 = extractFloat128Frac0( b );
6380     bExp = extractFloat128Exp( b );
6381     if ( aExp == 0x7FFF ) {
6382         if (    ( aSig0 | aSig1 )
6383              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6384             return propagateFloat128NaN( a, b STATUS_VAR );
6385         }
6386         goto invalid;
6387     }
6388     if ( bExp == 0x7FFF ) {
6389         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6390         return a;
6391     }
6392     if ( bExp == 0 ) {
6393         if ( ( bSig0 | bSig1 ) == 0 ) {
6394  invalid:
6395             float_raise( float_flag_invalid STATUS_VAR);
6396             z.low = float128_default_nan_low;
6397             z.high = float128_default_nan_high;
6398             return z;
6399         }
6400         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6401     }
6402     if ( aExp == 0 ) {
6403         if ( ( aSig0 | aSig1 ) == 0 ) return a;
6404         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6405     }
6406     expDiff = aExp - bExp;
6407     if ( expDiff < -1 ) return a;
6408     shortShift128Left(
6409         aSig0 | LIT64( 0x0001000000000000 ),
6410         aSig1,
6411         15 - ( expDiff < 0 ),
6412         &aSig0,
6413         &aSig1
6414     );
6415     shortShift128Left(
6416         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6417     q = le128( bSig0, bSig1, aSig0, aSig1 );
6418     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6419     expDiff -= 64;
6420     while ( 0 < expDiff ) {
6421         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6422         q = ( 4 < q ) ? q - 4 : 0;
6423         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6424         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6425         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6426         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6427         expDiff -= 61;
6428     }
6429     if ( -64 < expDiff ) {
6430         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6431         q = ( 4 < q ) ? q - 4 : 0;
6432         q >>= - expDiff;
6433         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6434         expDiff += 52;
6435         if ( expDiff < 0 ) {
6436             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6437         }
6438         else {
6439             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6440         }
6441         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6442         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6443     }
6444     else {
6445         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6446         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6447     }
6448     do {
6449         alternateASig0 = aSig0;
6450         alternateASig1 = aSig1;
6451         ++q;
6452         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6453     } while ( 0 <= (int64_t) aSig0 );
6454     add128(
6455         aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
6456     if (    ( sigMean0 < 0 )
6457          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6458         aSig0 = alternateASig0;
6459         aSig1 = alternateASig1;
6460     }
6461     zSign = ( (int64_t) aSig0 < 0 );
6462     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6463     return
6464         normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
6465 
6466 }
6467 
6468 /*----------------------------------------------------------------------------
6469 | Returns the square root of the quadruple-precision floating-point value `a'.
6470 | The operation is performed according to the IEC/IEEE Standard for Binary
6471 | Floating-Point Arithmetic.
6472 *----------------------------------------------------------------------------*/
6473 
6474 float128 float128_sqrt( float128 a STATUS_PARAM )
6475 {
6476     flag aSign;
6477     int32 aExp, zExp;
6478     uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6479     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6480     float128 z;
6481 
6482     aSig1 = extractFloat128Frac1( a );
6483     aSig0 = extractFloat128Frac0( a );
6484     aExp = extractFloat128Exp( a );
6485     aSign = extractFloat128Sign( a );
6486     if ( aExp == 0x7FFF ) {
6487         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
6488         if ( ! aSign ) return a;
6489         goto invalid;
6490     }
6491     if ( aSign ) {
6492         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6493  invalid:
6494         float_raise( float_flag_invalid STATUS_VAR);
6495         z.low = float128_default_nan_low;
6496         z.high = float128_default_nan_high;
6497         return z;
6498     }
6499     if ( aExp == 0 ) {
6500         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6501         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6502     }
6503     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6504     aSig0 |= LIT64( 0x0001000000000000 );
6505     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6506     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6507     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6508     doubleZSig0 = zSig0<<1;
6509     mul64To128( zSig0, zSig0, &term0, &term1 );
6510     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6511     while ( (int64_t) rem0 < 0 ) {
6512         --zSig0;
6513         doubleZSig0 -= 2;
6514         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6515     }
6516     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6517     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6518         if ( zSig1 == 0 ) zSig1 = 1;
6519         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6520         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6521         mul64To128( zSig1, zSig1, &term2, &term3 );
6522         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6523         while ( (int64_t) rem1 < 0 ) {
6524             --zSig1;
6525             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6526             term3 |= 1;
6527             term2 |= doubleZSig0;
6528             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6529         }
6530         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6531     }
6532     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6533     return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6534 
6535 }
6536 
6537 /*----------------------------------------------------------------------------
6538 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6539 | the corresponding value `b', and 0 otherwise.  The invalid exception is
6540 | raised if either operand is a NaN.  Otherwise, the comparison is performed
6541 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6542 *----------------------------------------------------------------------------*/
6543 
6544 int float128_eq( float128 a, float128 b STATUS_PARAM )
6545 {
6546 
6547     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6548               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6549          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6550               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6551        ) {
6552         float_raise( float_flag_invalid STATUS_VAR);
6553         return 0;
6554     }
6555     return
6556            ( a.low == b.low )
6557         && (    ( a.high == b.high )
6558              || (    ( a.low == 0 )
6559                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6560            );
6561 
6562 }
6563 
6564 /*----------------------------------------------------------------------------
6565 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6566 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
6567 | exception is raised if either operand is a NaN.  The comparison is performed
6568 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6569 *----------------------------------------------------------------------------*/
6570 
6571 int float128_le( float128 a, float128 b STATUS_PARAM )
6572 {
6573     flag aSign, bSign;
6574 
6575     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6576               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6577          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6578               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6579        ) {
6580         float_raise( float_flag_invalid STATUS_VAR);
6581         return 0;
6582     }
6583     aSign = extractFloat128Sign( a );
6584     bSign = extractFloat128Sign( b );
6585     if ( aSign != bSign ) {
6586         return
6587                aSign
6588             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6589                  == 0 );
6590     }
6591     return
6592           aSign ? le128( b.high, b.low, a.high, a.low )
6593         : le128( a.high, a.low, b.high, b.low );
6594 
6595 }
6596 
6597 /*----------------------------------------------------------------------------
6598 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6599 | the corresponding value `b', and 0 otherwise.  The invalid exception is
6600 | raised if either operand is a NaN.  The comparison is performed according
6601 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6602 *----------------------------------------------------------------------------*/
6603 
6604 int float128_lt( float128 a, float128 b STATUS_PARAM )
6605 {
6606     flag aSign, bSign;
6607 
6608     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6609               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6610          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6611               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6612        ) {
6613         float_raise( float_flag_invalid STATUS_VAR);
6614         return 0;
6615     }
6616     aSign = extractFloat128Sign( a );
6617     bSign = extractFloat128Sign( b );
6618     if ( aSign != bSign ) {
6619         return
6620                aSign
6621             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6622                  != 0 );
6623     }
6624     return
6625           aSign ? lt128( b.high, b.low, a.high, a.low )
6626         : lt128( a.high, a.low, b.high, b.low );
6627 
6628 }
6629 
6630 /*----------------------------------------------------------------------------
6631 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6632 | be compared, and 0 otherwise.  The invalid exception is raised if either
6633 | operand is a NaN. The comparison is performed according to the IEC/IEEE
6634 | Standard for Binary Floating-Point Arithmetic.
6635 *----------------------------------------------------------------------------*/
6636 
6637 int float128_unordered( float128 a, float128 b STATUS_PARAM )
6638 {
6639     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6640               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6641          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6642               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6643        ) {
6644         float_raise( float_flag_invalid STATUS_VAR);
6645         return 1;
6646     }
6647     return 0;
6648 }
6649 
6650 /*----------------------------------------------------------------------------
6651 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6652 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
6653 | exception.  The comparison is performed according to the IEC/IEEE Standard
6654 | for Binary Floating-Point Arithmetic.
6655 *----------------------------------------------------------------------------*/
6656 
6657 int float128_eq_quiet( float128 a, float128 b STATUS_PARAM )
6658 {
6659 
6660     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6661               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6662          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6663               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6664        ) {
6665         if (    float128_is_signaling_nan( a )
6666              || float128_is_signaling_nan( b ) ) {
6667             float_raise( float_flag_invalid STATUS_VAR);
6668         }
6669         return 0;
6670     }
6671     return
6672            ( a.low == b.low )
6673         && (    ( a.high == b.high )
6674              || (    ( a.low == 0 )
6675                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6676            );
6677 
6678 }
6679 
6680 /*----------------------------------------------------------------------------
6681 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6682 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
6683 | cause an exception.  Otherwise, the comparison is performed according to the
6684 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6685 *----------------------------------------------------------------------------*/
6686 
6687 int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
6688 {
6689     flag aSign, bSign;
6690 
6691     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6692               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6693          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6694               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6695        ) {
6696         if (    float128_is_signaling_nan( a )
6697              || float128_is_signaling_nan( b ) ) {
6698             float_raise( float_flag_invalid STATUS_VAR);
6699         }
6700         return 0;
6701     }
6702     aSign = extractFloat128Sign( a );
6703     bSign = extractFloat128Sign( b );
6704     if ( aSign != bSign ) {
6705         return
6706                aSign
6707             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6708                  == 0 );
6709     }
6710     return
6711           aSign ? le128( b.high, b.low, a.high, a.low )
6712         : le128( a.high, a.low, b.high, b.low );
6713 
6714 }
6715 
6716 /*----------------------------------------------------------------------------
6717 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6718 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
6719 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
6720 | Standard for Binary Floating-Point Arithmetic.
6721 *----------------------------------------------------------------------------*/
6722 
6723 int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
6724 {
6725     flag aSign, bSign;
6726 
6727     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6728               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6729          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6730               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6731        ) {
6732         if (    float128_is_signaling_nan( a )
6733              || float128_is_signaling_nan( b ) ) {
6734             float_raise( float_flag_invalid STATUS_VAR);
6735         }
6736         return 0;
6737     }
6738     aSign = extractFloat128Sign( a );
6739     bSign = extractFloat128Sign( b );
6740     if ( aSign != bSign ) {
6741         return
6742                aSign
6743             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6744                  != 0 );
6745     }
6746     return
6747           aSign ? lt128( b.high, b.low, a.high, a.low )
6748         : lt128( a.high, a.low, b.high, b.low );
6749 
6750 }
6751 
6752 /*----------------------------------------------------------------------------
6753 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6754 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
6755 | comparison is performed according to the IEC/IEEE Standard for Binary
6756 | Floating-Point Arithmetic.
6757 *----------------------------------------------------------------------------*/
6758 
6759 int float128_unordered_quiet( float128 a, float128 b STATUS_PARAM )
6760 {
6761     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6762               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6763          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6764               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6765        ) {
6766         if (    float128_is_signaling_nan( a )
6767              || float128_is_signaling_nan( b ) ) {
6768             float_raise( float_flag_invalid STATUS_VAR);
6769         }
6770         return 1;
6771     }
6772     return 0;
6773 }
6774 
6775 /* misc functions */
6776 float32 uint32_to_float32(uint32_t a STATUS_PARAM)
6777 {
6778     return int64_to_float32(a STATUS_VAR);
6779 }
6780 
6781 float64 uint32_to_float64(uint32_t a STATUS_PARAM)
6782 {
6783     return int64_to_float64(a STATUS_VAR);
6784 }
6785 
6786 uint32 float32_to_uint32( float32 a STATUS_PARAM )
6787 {
6788     int64_t v;
6789     uint32 res;
6790     int old_exc_flags = get_float_exception_flags(status);
6791 
6792     v = float32_to_int64(a STATUS_VAR);
6793     if (v < 0) {
6794         res = 0;
6795     } else if (v > 0xffffffff) {
6796         res = 0xffffffff;
6797     } else {
6798         return v;
6799     }
6800     set_float_exception_flags(old_exc_flags, status);
6801     float_raise(float_flag_invalid STATUS_VAR);
6802     return res;
6803 }
6804 
6805 uint32 float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
6806 {
6807     int64_t v;
6808     uint32 res;
6809     int old_exc_flags = get_float_exception_flags(status);
6810 
6811     v = float32_to_int64_round_to_zero(a STATUS_VAR);
6812     if (v < 0) {
6813         res = 0;
6814     } else if (v > 0xffffffff) {
6815         res = 0xffffffff;
6816     } else {
6817         return v;
6818     }
6819     set_float_exception_flags(old_exc_flags, status);
6820     float_raise(float_flag_invalid STATUS_VAR);
6821     return res;
6822 }
6823 
6824 int_fast16_t float32_to_int16(float32 a STATUS_PARAM)
6825 {
6826     int32_t v;
6827     int_fast16_t res;
6828     int old_exc_flags = get_float_exception_flags(status);
6829 
6830     v = float32_to_int32(a STATUS_VAR);
6831     if (v < -0x8000) {
6832         res = -0x8000;
6833     } else if (v > 0x7fff) {
6834         res = 0x7fff;
6835     } else {
6836         return v;
6837     }
6838 
6839     set_float_exception_flags(old_exc_flags, status);
6840     float_raise(float_flag_invalid STATUS_VAR);
6841     return res;
6842 }
6843 
6844 uint_fast16_t float32_to_uint16(float32 a STATUS_PARAM)
6845 {
6846     int32_t v;
6847     uint_fast16_t res;
6848     int old_exc_flags = get_float_exception_flags(status);
6849 
6850     v = float32_to_int32(a STATUS_VAR);
6851     if (v < 0) {
6852         res = 0;
6853     } else if (v > 0xffff) {
6854         res = 0xffff;
6855     } else {
6856         return v;
6857     }
6858 
6859     set_float_exception_flags(old_exc_flags, status);
6860     float_raise(float_flag_invalid STATUS_VAR);
6861     return res;
6862 }
6863 
6864 uint_fast16_t float32_to_uint16_round_to_zero(float32 a STATUS_PARAM)
6865 {
6866     int64_t v;
6867     uint_fast16_t res;
6868     int old_exc_flags = get_float_exception_flags(status);
6869 
6870     v = float32_to_int64_round_to_zero(a STATUS_VAR);
6871     if (v < 0) {
6872         res = 0;
6873     } else if (v > 0xffff) {
6874         res = 0xffff;
6875     } else {
6876         return v;
6877     }
6878     set_float_exception_flags(old_exc_flags, status);
6879     float_raise(float_flag_invalid STATUS_VAR);
6880     return res;
6881 }
6882 
6883 uint32 float64_to_uint32( float64 a STATUS_PARAM )
6884 {
6885     uint64_t v;
6886     uint32 res;
6887     int old_exc_flags = get_float_exception_flags(status);
6888 
6889     v = float64_to_uint64(a STATUS_VAR);
6890     if (v > 0xffffffff) {
6891         res = 0xffffffff;
6892     } else {
6893         return v;
6894     }
6895     set_float_exception_flags(old_exc_flags, status);
6896     float_raise(float_flag_invalid STATUS_VAR);
6897     return res;
6898 }
6899 
6900 uint32 float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
6901 {
6902     uint64_t v;
6903     uint32 res;
6904     int old_exc_flags = get_float_exception_flags(status);
6905 
6906     v = float64_to_uint64_round_to_zero(a STATUS_VAR);
6907     if (v > 0xffffffff) {
6908         res = 0xffffffff;
6909     } else {
6910         return v;
6911     }
6912     set_float_exception_flags(old_exc_flags, status);
6913     float_raise(float_flag_invalid STATUS_VAR);
6914     return res;
6915 }
6916 
6917 int_fast16_t float64_to_int16(float64 a STATUS_PARAM)
6918 {
6919     int64_t v;
6920     int_fast16_t res;
6921     int old_exc_flags = get_float_exception_flags(status);
6922 
6923     v = float64_to_int32(a STATUS_VAR);
6924     if (v < -0x8000) {
6925         res = -0x8000;
6926     } else if (v > 0x7fff) {
6927         res = 0x7fff;
6928     } else {
6929         return v;
6930     }
6931 
6932     set_float_exception_flags(old_exc_flags, status);
6933     float_raise(float_flag_invalid STATUS_VAR);
6934     return res;
6935 }
6936 
6937 uint_fast16_t float64_to_uint16(float64 a STATUS_PARAM)
6938 {
6939     int64_t v;
6940     uint_fast16_t res;
6941     int old_exc_flags = get_float_exception_flags(status);
6942 
6943     v = float64_to_int32(a STATUS_VAR);
6944     if (v < 0) {
6945         res = 0;
6946     } else if (v > 0xffff) {
6947         res = 0xffff;
6948     } else {
6949         return v;
6950     }
6951 
6952     set_float_exception_flags(old_exc_flags, status);
6953     float_raise(float_flag_invalid STATUS_VAR);
6954     return res;
6955 }
6956 
6957 uint_fast16_t float64_to_uint16_round_to_zero(float64 a STATUS_PARAM)
6958 {
6959     int64_t v;
6960     uint_fast16_t res;
6961     int old_exc_flags = get_float_exception_flags(status);
6962 
6963     v = float64_to_int64_round_to_zero(a STATUS_VAR);
6964     if (v < 0) {
6965         res = 0;
6966     } else if (v > 0xffff) {
6967         res = 0xffff;
6968     } else {
6969         return v;
6970     }
6971     set_float_exception_flags(old_exc_flags, status);
6972     float_raise(float_flag_invalid STATUS_VAR);
6973     return res;
6974 }
6975 
6976 /*----------------------------------------------------------------------------
6977 | Returns the result of converting the double-precision floating-point value
6978 | `a' to the 64-bit unsigned integer format.  The conversion is
6979 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6980 | Arithmetic---which means in particular that the conversion is rounded
6981 | according to the current rounding mode.  If `a' is a NaN, the largest
6982 | positive integer is returned.  If the conversion overflows, the
6983 | largest unsigned integer is returned.  If 'a' is negative, the value is
6984 | rounded and zero is returned; negative values that do not round to zero
6985 | will raise the inexact exception.
6986 *----------------------------------------------------------------------------*/
6987 
6988 uint64_t float64_to_uint64(float64 a STATUS_PARAM)
6989 {
6990     flag aSign;
6991     int_fast16_t aExp, shiftCount;
6992     uint64_t aSig, aSigExtra;
6993     a = float64_squash_input_denormal(a STATUS_VAR);
6994 
6995     aSig = extractFloat64Frac(a);
6996     aExp = extractFloat64Exp(a);
6997     aSign = extractFloat64Sign(a);
6998     if (aSign && (aExp > 1022)) {
6999         float_raise(float_flag_invalid STATUS_VAR);
7000         if (float64_is_any_nan(a)) {
7001             return LIT64(0xFFFFFFFFFFFFFFFF);
7002         } else {
7003             return 0;
7004         }
7005     }
7006     if (aExp) {
7007         aSig |= LIT64(0x0010000000000000);
7008     }
7009     shiftCount = 0x433 - aExp;
7010     if (shiftCount <= 0) {
7011         if (0x43E < aExp) {
7012             float_raise(float_flag_invalid STATUS_VAR);
7013             return LIT64(0xFFFFFFFFFFFFFFFF);
7014         }
7015         aSigExtra = 0;
7016         aSig <<= -shiftCount;
7017     } else {
7018         shift64ExtraRightJamming(aSig, 0, shiftCount, &aSig, &aSigExtra);
7019     }
7020     return roundAndPackUint64(aSign, aSig, aSigExtra STATUS_VAR);
7021 }
7022 
7023 uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
7024 {
7025     signed char current_rounding_mode = STATUS(float_rounding_mode);
7026     set_float_rounding_mode(float_round_to_zero STATUS_VAR);
7027     int64_t v = float64_to_uint64(a STATUS_VAR);
7028     set_float_rounding_mode(current_rounding_mode STATUS_VAR);
7029     return v;
7030 }
7031 
7032 #define COMPARE(s, nan_exp)                                                  \
7033 INLINE int float ## s ## _compare_internal( float ## s a, float ## s b,      \
7034                                       int is_quiet STATUS_PARAM )            \
7035 {                                                                            \
7036     flag aSign, bSign;                                                       \
7037     uint ## s ## _t av, bv;                                                  \
7038     a = float ## s ## _squash_input_denormal(a STATUS_VAR);                  \
7039     b = float ## s ## _squash_input_denormal(b STATUS_VAR);                  \
7040                                                                              \
7041     if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) &&                    \
7042          extractFloat ## s ## Frac( a ) ) ||                                 \
7043         ( ( extractFloat ## s ## Exp( b ) == nan_exp ) &&                    \
7044           extractFloat ## s ## Frac( b ) )) {                                \
7045         if (!is_quiet ||                                                     \
7046             float ## s ## _is_signaling_nan( a ) ||                          \
7047             float ## s ## _is_signaling_nan( b ) ) {                         \
7048             float_raise( float_flag_invalid STATUS_VAR);                     \
7049         }                                                                    \
7050         return float_relation_unordered;                                     \
7051     }                                                                        \
7052     aSign = extractFloat ## s ## Sign( a );                                  \
7053     bSign = extractFloat ## s ## Sign( b );                                  \
7054     av = float ## s ## _val(a);                                              \
7055     bv = float ## s ## _val(b);                                              \
7056     if ( aSign != bSign ) {                                                  \
7057         if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) {                   \
7058             /* zero case */                                                  \
7059             return float_relation_equal;                                     \
7060         } else {                                                             \
7061             return 1 - (2 * aSign);                                          \
7062         }                                                                    \
7063     } else {                                                                 \
7064         if (av == bv) {                                                      \
7065             return float_relation_equal;                                     \
7066         } else {                                                             \
7067             return 1 - 2 * (aSign ^ ( av < bv ));                            \
7068         }                                                                    \
7069     }                                                                        \
7070 }                                                                            \
7071                                                                              \
7072 int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM )        \
7073 {                                                                            \
7074     return float ## s ## _compare_internal(a, b, 0 STATUS_VAR);              \
7075 }                                                                            \
7076                                                                              \
7077 int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM )  \
7078 {                                                                            \
7079     return float ## s ## _compare_internal(a, b, 1 STATUS_VAR);              \
7080 }
7081 
/* Instantiate the _compare_internal / _compare / _compare_quiet functions
 * for float32 and float64.  The second argument is the exponent field value
 * that, together with a nonzero fraction, identifies a NaN in that format. */
COMPARE(32, 0xff)
COMPARE(64, 0x7ff)
7084 
7085 INLINE int floatx80_compare_internal( floatx80 a, floatx80 b,
7086                                       int is_quiet STATUS_PARAM )
7087 {
7088     flag aSign, bSign;
7089 
7090     if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
7091           ( extractFloatx80Frac( a )<<1 ) ) ||
7092         ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
7093           ( extractFloatx80Frac( b )<<1 ) )) {
7094         if (!is_quiet ||
7095             floatx80_is_signaling_nan( a ) ||
7096             floatx80_is_signaling_nan( b ) ) {
7097             float_raise( float_flag_invalid STATUS_VAR);
7098         }
7099         return float_relation_unordered;
7100     }
7101     aSign = extractFloatx80Sign( a );
7102     bSign = extractFloatx80Sign( b );
7103     if ( aSign != bSign ) {
7104 
7105         if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
7106              ( ( a.low | b.low ) == 0 ) ) {
7107             /* zero case */
7108             return float_relation_equal;
7109         } else {
7110             return 1 - (2 * aSign);
7111         }
7112     } else {
7113         if (a.low == b.low && a.high == b.high) {
7114             return float_relation_equal;
7115         } else {
7116             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7117         }
7118     }
7119 }
7120 
7121 int floatx80_compare( floatx80 a, floatx80 b STATUS_PARAM )
7122 {
7123     return floatx80_compare_internal(a, b, 0 STATUS_VAR);
7124 }
7125 
7126 int floatx80_compare_quiet( floatx80 a, floatx80 b STATUS_PARAM )
7127 {
7128     return floatx80_compare_internal(a, b, 1 STATUS_VAR);
7129 }
7130 
7131 INLINE int float128_compare_internal( float128 a, float128 b,
7132                                       int is_quiet STATUS_PARAM )
7133 {
7134     flag aSign, bSign;
7135 
7136     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7137           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7138         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7139           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7140         if (!is_quiet ||
7141             float128_is_signaling_nan( a ) ||
7142             float128_is_signaling_nan( b ) ) {
7143             float_raise( float_flag_invalid STATUS_VAR);
7144         }
7145         return float_relation_unordered;
7146     }
7147     aSign = extractFloat128Sign( a );
7148     bSign = extractFloat128Sign( b );
7149     if ( aSign != bSign ) {
7150         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7151             /* zero case */
7152             return float_relation_equal;
7153         } else {
7154             return 1 - (2 * aSign);
7155         }
7156     } else {
7157         if (a.low == b.low && a.high == b.high) {
7158             return float_relation_equal;
7159         } else {
7160             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7161         }
7162     }
7163 }
7164 
7165 int float128_compare( float128 a, float128 b STATUS_PARAM )
7166 {
7167     return float128_compare_internal(a, b, 0 STATUS_VAR);
7168 }
7169 
7170 int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
7171 {
7172     return float128_compare_internal(a, b, 1 STATUS_VAR);
7173 }
7174 
7175 /* min() and max() functions. These can't be implemented as
7176  * 'compare and pick one input' because that would mishandle
7177  * NaNs and +0 vs -0.
7178  *
7179  * minnum() and maxnum() functions. These are similar to the min()
7180  * and max() functions but if one of the arguments is a QNaN and
7181  * the other is numerical then the numerical argument is returned.
7182  * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
7183  * and maxNum() operations. min() and max() are the typical min/max
7184  * semantics provided by many CPUs which predate that specification.
7185  */
7186 #define MINMAX(s)                                                       \
7187 INLINE float ## s float ## s ## _minmax(float ## s a, float ## s b,     \
7188                                         int ismin, int isieee STATUS_PARAM) \
7189 {                                                                       \
7190     flag aSign, bSign;                                                  \
7191     uint ## s ## _t av, bv;                                             \
7192     a = float ## s ## _squash_input_denormal(a STATUS_VAR);             \
7193     b = float ## s ## _squash_input_denormal(b STATUS_VAR);             \
7194     if (float ## s ## _is_any_nan(a) ||                                 \
7195         float ## s ## _is_any_nan(b)) {                                 \
7196         if (isieee) {                                                   \
7197             if (float ## s ## _is_quiet_nan(a) &&                       \
7198                 !float ## s ##_is_any_nan(b)) {                         \
7199                 return b;                                               \
7200             } else if (float ## s ## _is_quiet_nan(b) &&                \
7201                        !float ## s ## _is_any_nan(a)) {                 \
7202                 return a;                                               \
7203             }                                                           \
7204         }                                                               \
7205         return propagateFloat ## s ## NaN(a, b STATUS_VAR);             \
7206     }                                                                   \
7207     aSign = extractFloat ## s ## Sign(a);                               \
7208     bSign = extractFloat ## s ## Sign(b);                               \
7209     av = float ## s ## _val(a);                                         \
7210     bv = float ## s ## _val(b);                                         \
7211     if (aSign != bSign) {                                               \
7212         if (ismin) {                                                    \
7213             return aSign ? a : b;                                       \
7214         } else {                                                        \
7215             return aSign ? b : a;                                       \
7216         }                                                               \
7217     } else {                                                            \
7218         if (ismin) {                                                    \
7219             return (aSign ^ (av < bv)) ? a : b;                         \
7220         } else {                                                        \
7221             return (aSign ^ (av < bv)) ? b : a;                         \
7222         }                                                               \
7223     }                                                                   \
7224 }                                                                       \
7225                                                                         \
7226 float ## s float ## s ## _min(float ## s a, float ## s b STATUS_PARAM)  \
7227 {                                                                       \
7228     return float ## s ## _minmax(a, b, 1, 0 STATUS_VAR);                \
7229 }                                                                       \
7230                                                                         \
7231 float ## s float ## s ## _max(float ## s a, float ## s b STATUS_PARAM)  \
7232 {                                                                       \
7233     return float ## s ## _minmax(a, b, 0, 0 STATUS_VAR);                \
7234 }                                                                       \
7235                                                                         \
7236 float ## s float ## s ## _minnum(float ## s a, float ## s b STATUS_PARAM) \
7237 {                                                                       \
7238     return float ## s ## _minmax(a, b, 1, 1 STATUS_VAR);                \
7239 }                                                                       \
7240                                                                         \
7241 float ## s float ## s ## _maxnum(float ## s a, float ## s b STATUS_PARAM) \
7242 {                                                                       \
7243     return float ## s ## _minmax(a, b, 0, 1 STATUS_VAR);                \
7244 }
7245 
7246 MINMAX(32)
7247 MINMAX(64)
7248 
7249 
7250 /* Multiply A by 2 raised to the power N.  */
7251 float32 float32_scalbn( float32 a, int n STATUS_PARAM )
7252 {
7253     flag aSign;
7254     int16_t aExp;
7255     uint32_t aSig;
7256 
7257     a = float32_squash_input_denormal(a STATUS_VAR);
7258     aSig = extractFloat32Frac( a );
7259     aExp = extractFloat32Exp( a );
7260     aSign = extractFloat32Sign( a );
7261 
7262     if ( aExp == 0xFF ) {
7263         if ( aSig ) {
7264             return propagateFloat32NaN( a, a STATUS_VAR );
7265         }
7266         return a;
7267     }
7268     if (aExp != 0) {
7269         aSig |= 0x00800000;
7270     } else if (aSig == 0) {
7271         return a;
7272     } else {
7273         aExp++;
7274     }
7275 
7276     if (n > 0x200) {
7277         n = 0x200;
7278     } else if (n < -0x200) {
7279         n = -0x200;
7280     }
7281 
7282     aExp += n - 1;
7283     aSig <<= 7;
7284     return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
7285 }
7286 
7287 float64 float64_scalbn( float64 a, int n STATUS_PARAM )
7288 {
7289     flag aSign;
7290     int16_t aExp;
7291     uint64_t aSig;
7292 
7293     a = float64_squash_input_denormal(a STATUS_VAR);
7294     aSig = extractFloat64Frac( a );
7295     aExp = extractFloat64Exp( a );
7296     aSign = extractFloat64Sign( a );
7297 
7298     if ( aExp == 0x7FF ) {
7299         if ( aSig ) {
7300             return propagateFloat64NaN( a, a STATUS_VAR );
7301         }
7302         return a;
7303     }
7304     if (aExp != 0) {
7305         aSig |= LIT64( 0x0010000000000000 );
7306     } else if (aSig == 0) {
7307         return a;
7308     } else {
7309         aExp++;
7310     }
7311 
7312     if (n > 0x1000) {
7313         n = 0x1000;
7314     } else if (n < -0x1000) {
7315         n = -0x1000;
7316     }
7317 
7318     aExp += n - 1;
7319     aSig <<= 10;
7320     return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
7321 }
7322 
7323 floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
7324 {
7325     flag aSign;
7326     int32_t aExp;
7327     uint64_t aSig;
7328 
7329     aSig = extractFloatx80Frac( a );
7330     aExp = extractFloatx80Exp( a );
7331     aSign = extractFloatx80Sign( a );
7332 
7333     if ( aExp == 0x7FFF ) {
7334         if ( aSig<<1 ) {
7335             return propagateFloatx80NaN( a, a STATUS_VAR );
7336         }
7337         return a;
7338     }
7339 
7340     if (aExp == 0) {
7341         if (aSig == 0) {
7342             return a;
7343         }
7344         aExp++;
7345     }
7346 
7347     if (n > 0x10000) {
7348         n = 0x10000;
7349     } else if (n < -0x10000) {
7350         n = -0x10000;
7351     }
7352 
7353     aExp += n;
7354     return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
7355                                           aSign, aExp, aSig, 0 STATUS_VAR );
7356 }
7357 
7358 float128 float128_scalbn( float128 a, int n STATUS_PARAM )
7359 {
7360     flag aSign;
7361     int32_t aExp;
7362     uint64_t aSig0, aSig1;
7363 
7364     aSig1 = extractFloat128Frac1( a );
7365     aSig0 = extractFloat128Frac0( a );
7366     aExp = extractFloat128Exp( a );
7367     aSign = extractFloat128Sign( a );
7368     if ( aExp == 0x7FFF ) {
7369         if ( aSig0 | aSig1 ) {
7370             return propagateFloat128NaN( a, a STATUS_VAR );
7371         }
7372         return a;
7373     }
7374     if (aExp != 0) {
7375         aSig0 |= LIT64( 0x0001000000000000 );
7376     } else if (aSig0 == 0 && aSig1 == 0) {
7377         return a;
7378     } else {
7379         aExp++;
7380     }
7381 
7382     if (n > 0x10000) {
7383         n = 0x10000;
7384     } else if (n < -0x10000) {
7385         n = -0x10000;
7386     }
7387 
7388     aExp += n - 1;
7389     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7390                                           STATUS_VAR );
7391 
7392 }
7393